/**
 * Copyright (c) 2024 Huawei Technologies Co., Ltd.
 * This file is a part of the CANN Open Software.
 * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

/*!
 * \file copy_cube_in_using_ub.h
 * \brief
 */

#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_USING_UB_H
#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_USING_UB_H

#include "../../matmul_module.h"
#include "../../matmul_param.h"
#include "copy_cube_in_intf.h"

namespace AscendC {
namespace Impl {
namespace Detail {


// Constants shared by the ND->NZ transform and anti-quant helpers in this file.
// NOTE(review): several are referenced only by code further down, beyond this
// section of the file; see the usage sites for their exact meaning.
constexpr int32_t FIRST_16BIT_OFFSET_MM_API = 16;
constexpr int32_t SECOND_16BIT_OFFSET_MM_API = 32;
constexpr int32_t THIRD_16BIT_OFFSET_MM_API = 48;
constexpr int32_t REPEAT_BLOCK_NUM_MM_API = 8;
// Bytes per on-chip block; combined with CACHE_LINE_SIZE_MM_API in CopyND2NZ's
// bank-conflict width check.
constexpr int32_t EACH_BLOCK_BYTES_MM_API = 32;
constexpr int32_t CACHE_LINE_SIZE_MM_API = 512;
constexpr int32_t TRANS_DATA_ARRAY_SIZE_MM_API = 16;
constexpr int32_t ANTI_QUANT_ALIGN_SIZE_MM_API = 32;
constexpr int32_t MAX_BLOCK_COUNT_SIZE_MM_API = 4095;
/*
    CopyCubeIn is considered entirely experimental.
    We retain the freedom to make incompatible changes, but do not guarantee the stability.
    CopyCubeIn is only for internal usage, does not support extension or customized specialization!
*/
template <typename IMPL, class INPUT_TYPE, const auto& MM_CFG>
class CopyCubeIn<IMPL, INPUT_TYPE, MM_CFG,
    enable_if_t<MatmulFeatureTrait<MM_CFG>::IsNeedUB() &&
    ((GetCopyCubeInType<INPUT_TYPE, MM_CFG>() == CopyCubeInType::NORMAL) ||
    (GetCopyCubeInType<INPUT_TYPE, MM_CFG>() == CopyCubeInType::MDL))>>
{
    MATMUL_USE_MODULE_ON(CubeInBuffer, INPUT_TYPE::TAG);
    MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG);
    MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG);
    MATMUL_USE_MODULE(MatmulShapeInfo);
    MATMUL_USE_MODULE(MatmulShapeTiling);
    MATMUL_USE_MODULE(LocalWorkspace);
    MATMUL_USE_MODULE(MLoop);
    MATMUL_USE_MODULE(NLoop);
    MATMUL_USE_MODULE(KLoop);
    using TRANS_T = typename INPUT_TYPE::TRANS_T;
    using SRC_T = typename INPUT_TYPE::T;

public:
    // Default construction/destruction; buffer setup is deferred to Init().
    inline __aicore__ CopyCubeIn() = default;
    inline __aicore__ ~CopyCubeIn() = default;

    // Initializes the L1 input buffer with the size/depth reported by
    // CopyCubeInParams, and — for MDL configs with enableL1CacheUB — reserves
    // the UB-side cache queue when the tiling requests a non-zero cache depth.
    __aicore__ inline void Init()
    {
        auto params = MATMUL_MODULE(CopyCubeInParams);
        MATMUL_MODULE(CubeInBuffer)->Init(params->GetBufferSize(), params->GetDepth());

        if constexpr (DoMatmulMDL(MM_CFG) && ToMatmulConfig(MM_CFG).enableL1CacheUB) {
            const int32_t cacheDepth = GetDepthL1CacheUB<INPUT_TYPE::TAG>();
            if (cacheDepth > 0) {
                // One queue slot sized to hold cacheDepth step-blocks of SRC_T.
                auto cacheBytes = cacheDepth * params->GetStepCol() * params->GetStepRow() *
                                  params->GetBufferSize() * sizeof(SRC_T);
                GetTPipePtr()->InitBuffer(qidUBCache_, 1, cacheBytes);
            }
        }
    }

    // Registers a UB-resident source matrix and invalidates any cached tiles,
    // since the cache may still refer to the previous input.
    __aicore__ inline void SetInput(const LocalTensor<SRC_T>& localMatrix, bool isTranspose)
    {
        // do Set Local Input here
        MATMUL_MODULE(MatmulTensorInfo)->SetLocalTensor(localMatrix, isTranspose);
        MATMUL_MODULE(CubeInBuffer)->Reset();
    }

    // Registers a GM-resident source matrix and invalidates any cached tiles,
    // since the cache may still refer to the previous input.
    __aicore__ inline void SetInput(const GlobalTensor<SRC_T>& globalMatrix, bool isTranspose)
    {
        // do Set Global Input here
        MATMUL_MODULE(MatmulTensorInfo)->SetGlobalTensor(globalMatrix, isTranspose);
        MATMUL_MODULE(CubeInBuffer)->Reset();
    }

    // Invalidates all cached L1 tiles without releasing the buffer itself.
    __aicore__ inline void Reset()
    {
        MATMUL_MODULE(CubeInBuffer)->Reset();
    }

    // Ensures the (curRow, curCol) tile of size tileHeight x tileWidth is
    // resident in L1 and returns the L1 tensor holding it. On a cache hit the
    // existing buffer is returned; on a miss a buffer is allocated, the tile is
    // copied in (row/col and height/width are swapped when the input is
    // transposed), then the buffer is published via EnQue/DeQue.
    // `context` is accepted for scheduler compatibility but unused here.
    template <typename ScheduleContext = int>
    __aicore__ inline LocalTensor<TRANS_T> LoadData(
        int32_t curRow, int32_t curCol, int32_t tileHeight, int32_t tileWidth, const ScheduleContext& context = {})
    {
        LocalTensor<TRANS_T> l1;
        if constexpr (DoMatmulMDL(MM_CFG)) {
            // MDL: the cache is addressed by the iteration index plus a
            // separate buffer position supplied by CopyCubeInParams.
            auto posL1 = GetIterIndex(curRow, curCol);
            auto bufferPos = MATMUL_MODULE(CopyCubeInParams)->GetBufferPos();
            if (MATMUL_MODULE(CubeInBuffer)->Hit(posL1, bufferPos)) {
                l1 = MATMUL_MODULE(CubeInBuffer)->GetBuffer(posL1, bufferPos);
            } else {
                l1 = MATMUL_MODULE(CubeInBuffer)->AllocTensor(bufferPos);
                if (IsTranspose()) {
                    // Transposed source: swap row/col and height/width.
                    CopyTileToCube<true>(l1, curCol, curRow, tileWidth, tileHeight);
                } else {
                    CopyTileToCube<false>(l1, curRow, curCol, tileHeight, tileWidth);
                }
                MATMUL_MODULE(CubeInBuffer)->EnQue(l1);
                MATMUL_MODULE(CubeInBuffer)->DeQue();
            }
        } else {
            // Non-MDL: the iteration index alone selects the cache slot.
            auto posL1 = GetIterIndex(curRow, curCol);
            if (MATMUL_MODULE(CubeInBuffer)->Hit(posL1)) {
                l1 = MATMUL_MODULE(CubeInBuffer)->GetBuffer(posL1);
            } else {
                l1 = MATMUL_MODULE(CubeInBuffer)->AllocTensor(posL1);
                if (IsTranspose()) {
                    CopyTileToCube<true>(l1, curCol, curRow, tileWidth, tileHeight);
                } else {
                    CopyTileToCube<false>(l1, curRow, curCol, tileHeight, tileWidth);
                }
                MATMUL_MODULE(CubeInBuffer)->EnQue(l1);
                MATMUL_MODULE(CubeInBuffer)->DeQue();
            }
        }
        return l1;
    }

    // Releases the tile buffer acquired by LoadData. For MDL the slot is keyed
    // by the buffer position (aMatrix/curRow/curCol unused); otherwise by the
    // iteration index together with the tensor being freed.
    __aicore__ inline void ClearLoadData(const LocalTensor<TRANS_T>& aMatrix = NULL_TENSOR<TRANS_T>,
        int32_t curRow = 0, int32_t curCol = 0)
    {
        if constexpr (DoMatmulMDL(MM_CFG)) {
            auto bufferPos = MATMUL_MODULE(CopyCubeInParams)->GetBufferPos();
            MATMUL_MODULE(CubeInBuffer)->FreeTensor(bufferPos);
        } else {
            auto posL1 = GetIterIndex(curRow, curCol);
            MATMUL_MODULE(CubeInBuffer)->FreeTensor(posL1, aMatrix);
        }
    }

    // Releases the CubeInBuffer resources; call when iteration is complete.
    __aicore__ inline void Destroy()
    {
        MATMUL_MODULE(CubeInBuffer)->Destroy();
    }

private:

    // Compile-time dispatch: matrix A queries the A-transpose flag, matrix B
    // the B-transpose flag, from MatmulShapeInfo.
    __aicore__ inline bool IsTranspose()
    {
        if constexpr (INPUT_TYPE::TAG != InputTypeTag::A) {
            return MATMUL_MODULE(MatmulShapeInfo)->IsTransposeB();
        } else {
            return MATMUL_MODULE(MatmulShapeInfo)->IsTransposeA();
        }
    }
 
    // Current base block shape reported by the N-direction loop.
    __aicore__ constexpr auto GetBaseUseN()
    {
        return MATMUL_MODULE(NLoop)->GetBaseShape();
    }
    
    // Current Kb tile shape reported by the K-direction loop.
    __aicore__ constexpr auto GetBaseUseStepKb()
    {
        return MATMUL_MODULE(KLoop)->GetTileShapeB();
    }

    // Copy height for matrix A at offset i tiles past the current M outer index.
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::A, int32_t> GetCopyHeight(int32_t i)
    {
        return MATMUL_MODULE(MLoop)->GetTileShapeOf(MATMUL_MODULE(MLoop)->GetOuterIdx() + i);
    }

    // Copy height for matrix B at offset i tiles past the current N outer index.
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::B, int32_t> GetCopyHeight(int32_t i)
    {
        return MATMUL_MODULE(NLoop)->GetTileShapeOf(MATMUL_MODULE(NLoop)->GetOuterIdx() + i);
    }

    // Copy width for matrix A at tile offset i: along M when A is transposed,
    // otherwise along Ka. `baseWidth` is unused here; it is kept for signature
    // parity with the matrix-B overload below.
    template <bool IS_TRANS = false, typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::A, int32_t>
    GetCopyWidth(int32_t i, int32_t baseWidth)
    {
        if constexpr (IS_TRANS) {
            return MATMUL_MODULE(MLoop)->GetTileShapeOf(MATMUL_MODULE(MLoop)->GetOuterIdx() + i);
        } else {
            return MATMUL_MODULE(KLoop)->GetTileShapeAOf(MATMUL_MODULE(KLoop)->GetOuterKaIdx() + i);
        }
    }

    // Copy width for matrix B at tile offset i, always taken along N.
    // `baseWidth` is unused; kept for signature parity with the A overload.
    template <bool IS_TRANS = false, typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::B, int32_t>
    GetCopyWidth(int32_t i, int32_t baseWidth)
    {
        return MATMUL_MODULE(NLoop)->GetTileShapeOf(MATMUL_MODULE(NLoop)->GetOuterIdx() + i);
    }

    // True when the M outer index advanced by i runs past the last M iteration.
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::A, bool> IsBufferPosEnd(int32_t i)
    {
        return MATMUL_MODULE(MLoop)->GetOuterIdx() + i >= MATMUL_MODULE(MLoop)->GetOuterIter();
    }

    // True when matrix A sits on its last M outer iteration.
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::A, bool> IsBufferPosEnd()
    {
        return MATMUL_MODULE(MLoop)->GetOuterIdx() == MATMUL_MODULE(MLoop)->GetOuterIter() - 1;
    }

    // True when the N outer index advanced by i runs past the last N iteration.
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::B, bool> IsBufferPosEnd(int32_t i)
    {
        return MATMUL_MODULE(NLoop)->GetOuterIdx() + i >= MATMUL_MODULE(NLoop)->GetOuterIter();
    }

    // True when matrix B sits on its last N outer iteration.
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::B, bool> IsBufferPosEnd()
    {
        return MATMUL_MODULE(NLoop)->GetOuterIdx() == MATMUL_MODULE(NLoop)->GetOuterIter() - 1;
    }

    // True when the Ka outer index advanced by i runs past the last stepKa
    // chunk of singleCoreK.
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::A, bool> IsBufferKPosEnd(int32_t i)
    {
        // Bind by reference: `auto` copied the whole tiling struct on every call.
        const auto& tiling = MATMUL_MODULE(MatmulShapeTiling)->GetTiling();
        int32_t stepKaIter = Ceil(
            MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreK(), tiling.GetBaseK() * tiling.GetStepKa());
        return MATMUL_MODULE(KLoop)->GetOuterKaIdx() + i >= stepKaIter;
    }

    // True when matrix A sits on its last stepKa chunk of singleCoreK.
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::A, bool> IsBufferKPosEnd()
    {
        // Bind by reference: `auto` copied the whole tiling struct on every call.
        const auto& tiling = MATMUL_MODULE(MatmulShapeTiling)->GetTiling();
        int32_t stepKaIter = Ceil(
            MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreK(), tiling.GetBaseK() * tiling.GetStepKa());
        return MATMUL_MODULE(KLoop)->GetOuterKaIdx() == stepKaIter - 1;
    }

    // True when the Kb outer index advanced by i runs past the last stepKb
    // chunk of singleCoreK.
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::B, bool> IsBufferKPosEnd(int32_t i)
    {
        // Bind by reference: `auto` copied the whole tiling struct on every call.
        const auto& tiling = MATMUL_MODULE(MatmulShapeTiling)->GetTiling();
        int32_t stepKbIter = Ceil(
            MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreK(), tiling.GetBaseK() * tiling.GetStepKb());
        return MATMUL_MODULE(KLoop)->GetOuterKbIdx() + i >= stepKbIter;
    }

    // True when matrix B sits on its last stepKb chunk of singleCoreK.
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::B, bool> IsBufferKPosEnd()
    {
        // Bind by reference: `auto` copied the whole tiling struct on every call.
        const auto& tiling = MATMUL_MODULE(MatmulShapeTiling)->GetTiling();
        int32_t stepKbIter = Ceil(
            MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreK(), tiling.GetBaseK() * tiling.GetStepKb());
        // Fix: compare the Kb outer index (was GetOuterKaIdx), matching the
        // stepKb-based iteration count and the i-offset B overload above.
        return MATMUL_MODULE(KLoop)->GetOuterKbIdx() == stepKbIter - 1;
    }

    // Whether the full K dimension of matrix A is loaded into L1 at once.
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::A, bool> IsL1KFullLoad() const
    {
        return MATMUL_CONST_PARAM_VAR.isA1KFullLoad_;
    }

    // Whether the full K dimension of matrix B is loaded into L1 at once.
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::B, bool> IsL1KFullLoad() const
    {
        return MATMUL_CONST_PARAM_VAR.isB1KFullLoad_;
    }

    // Cache-slot index for a matrix-A tile inside CubeInBuffer, derived from the
    // (curRow, curCol) iteration position and the configured iterate order.
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::A, int32_t>
    GetIterIndexInner(int32_t curRow, int32_t curCol)
    {
        if constexpr (DoMatmulNorm(MM_CFG) || DoMatmulIBShareNorm(MM_CFG)) {
            // Fix: removed the unused `auto& var = MATMUL_PARAM_VAR;` binding
            // (this branch reads everything through MATMUL_MODULE accessors).
            if (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetIterateOrder() ==
                static_cast<int>(IterateOrder::ORDER_M)) {
                // ORDER_M: only the K position distinguishes cached A tiles.
                return curCol;
            } else {
                // ORDER_N: A tiles recur every stepM * kIter iterations.
                return (curRow * MATMUL_MODULE(MatmulShapeInfo)->GetKIter() + curCol) %
                       (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepM() * MATMUL_MODULE(MatmulShapeInfo)->GetKIter());
            }
        } else if constexpr (DoMatmulSpecialBasicBlock(MM_CFG)) {
            auto& var = MATMUL_PARAM_VAR;
            if (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetIterateOrder() ==
                static_cast<int>(IterateOrder::ORDER_M)) {
                return curCol;
            } else {
                // Compile-time config supplies the step/shape constants here.
                return (curRow * var.kIter_ + curCol) %
                       (ToMatmulConfig(MM_CFG).stepM * ToMatmulConfig(MM_CFG).singleCoreK /
                        ToMatmulConfig(MM_CFG).basicK);
            }
        } else {
            return 0;
        }
    }

    // Cache-slot index for a matrix-B tile inside CubeInBuffer; mirrors the
    // matrix-A overload above with the roles of row/col (and M/N) swapped.
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ constexpr enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::B, int32_t>
    GetIterIndexInner(int32_t curRow, int32_t curCol)
    {
        if constexpr (DoMatmulNorm(MM_CFG) || DoMatmulIBShareNorm(MM_CFG)) {
            // Fix: removed the unused `auto& var = MATMUL_PARAM_VAR;` binding.
            if (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetIterateOrder() ==
                static_cast<int>(IterateOrder::ORDER_M)) {
                // ORDER_M: B tiles recur every stepN * kIter iterations.
                return (curRow + curCol * MATMUL_MODULE(MatmulShapeInfo)->GetKIter()) %
                       (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepN() * MATMUL_MODULE(MatmulShapeInfo)->GetKIter());
            } else {
                // ORDER_N: only the K position distinguishes cached B tiles.
                return curRow;
            }
        } else if constexpr (DoMatmulSpecialBasicBlock(MM_CFG)) {
            // Fix: was a plain runtime `else if`, unlike the `if constexpr` used
            // by the matrix-A overload; this branch must be discarded at compile
            // time for configs where var.kIter_ is not meaningful.
            auto& var = MATMUL_PARAM_VAR;
            if (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetIterateOrder() ==
                static_cast<int>(IterateOrder::ORDER_M)) {
                // Fix: parenthesize the modulus divisor as in the matrix-A
                // overload; operator precedence previously computed
                // ((x % stepN) * singleCoreK) / basicK instead.
                return (curRow + curCol * var.kIter_) %
                       (ToMatmulConfig(MM_CFG).stepN * ToMatmulConfig(MM_CFG).singleCoreK /
                        ToMatmulConfig(MM_CFG).basicK);
            } else {
                return curRow;
            }
        } else {
            return 0;
        }
    }

    // Norm-mode cache index: a single-buffer cache always uses slot 0; the
    // multi-slot buffer types derive the slot from the iteration position.
    template <const auto& MM_CFG_ALIAS = MM_CFG>
    __aicore__ constexpr enable_if_t<DoMatmulNorm(MM_CFG_ALIAS), int32_t> GetIterIndex(int32_t curRow, int32_t curCol)
    {
        if constexpr (GetCubeInBufferType<INPUT_TYPE, MM_CFG>() == CubeInBufferType::SINGLE_BUFFER) {
            return 0;
        } else if constexpr (GetCubeInBufferType<INPUT_TYPE, MM_CFG>() == CubeInBufferType::NORMAL ||
                             GetCubeInBufferType<INPUT_TYPE, MM_CFG>() == CubeInBufferType::SINGLE_GLOBAL_BUFFER ||
                             GetCubeInBufferType<INPUT_TYPE, MM_CFG>() == CubeInBufferType::DOUBLE_GLOBAL_BUFFER) {
            return GetIterIndexInner(curRow, curCol);
        } else {
            // Fix: previously no branch matched for other buffer types and
            // control flowed off the end of a value-returning function (UB).
            // Treat unknown buffer types as single-slot.
            return 0;
        }
    }

    // K-direction position of a matrix-A tile: the column index.
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ inline static enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::A, int32_t> GetCurKPos(int32_t curRow,
                                                                                                       int32_t curCol)
    {
        return curCol;
    }

    // K-direction position of a matrix-B tile: the row index.
    template <typename INPUT_TYPE_ALIAS = INPUT_TYPE>
    __aicore__ inline static enable_if_t<INPUT_TYPE_ALIAS::TAG == InputTypeTag::B, int32_t> GetCurKPos(int32_t curRow,
                                                                                                       int32_t curCol)
    {
        return curRow;
    }

    // MDL-mode cache index: the tile's K position (col for A, row for B)
    // modulo the number of cached K steps (stepKa/stepKb).
    template <const auto& MM_CFG_ALIAS = MM_CFG>
    __aicore__ constexpr enable_if_t<DoMatmulMDL(MM_CFG_ALIAS), int32_t> GetIterIndex(int32_t curRow, int32_t curCol)
    {
        return GetCurKPos(curRow, curCol) % GetMajorCacheNum();
    }

    // Number of K-direction steps the cache holds: stepKa for matrix A,
    // stepKb for matrix B.
    __aicore__ inline int32_t GetMajorCacheNum()
    {
        if constexpr (INPUT_TYPE::TAG == InputTypeTag::A) {
            return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKa();
        } else {
            return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKb();
        }
    }

    // Depth of the UB-side L1 cache as configured by the tiling:
    // DepthAL1CacheUB for matrix A, DepthBL1CacheUB for matrix B.
    template <InputTypeTag TAG>
    __aicore__ inline int32_t GetDepthL1CacheUB()
    {
        if constexpr (TAG == InputTypeTag::A) {
            return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetDepthAL1CacheUB();
        } else {
            return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetDepthBL1CacheUB();
        }
    }

    // Copies one tile into L1, dispatching at compile time on whether the
    // configured source lives in UB or GM. Returns false when the underlying
    // routine rejects the format/direction combination.
    template <bool IS_TRANS = false>
    __aicore__ inline bool CopyTileToCube(const LocalTensor<TRANS_T>& aMatrix, int curRow, int curCol,
                                          int tileHeight, int tileWidth)
    {
        if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) {
            return CopyTileToCubeFromUB<IS_TRANS>(aMatrix, curRow, curCol, tileHeight, tileWidth);
        } else {
            return CopyTileToCubeFromGM<IS_TRANS>(aMatrix, curRow, curCol, tileHeight, tileWidth);
        }
    }

    // Copies one tile from global memory into L1 (aMatrix), selecting the
    // transfer routine by source format:
    //   ND     -> ND->NZ conversion (int8 matrix B without transpose goes
    //             through a dedicated transpose-data path);
    //   NZ     -> NZ->NZ block copy (same int8-B special case);
    //   VECTOR -> single-row copy into A1 (rejected when K runs along rows).
    // Returns false for unsupported format/direction combinations.
    template <bool IS_TRANS = false>
    __aicore__ inline bool CopyTileToCubeFromGM(const LocalTensor<TRANS_T>& aMatrix, int curRow, int curCol,
                                                int tileHeight, int tileWidth)
    {
        if constexpr (INPUT_TYPE::format == CubeFormat::ND) {
            GlobalTensor<SRC_T> aGlobal;
            aGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalTensor().address_);
            // int8 matrix B that is not transposed needs the transpose-data path.
            if constexpr (INPUT_TYPE::TAG == InputTypeTag::B && IsSameTypeV<TRANS_T, int8_t> &&
                          IsSameTypeV<SRC_T, int8_t>) {
                if (!IsTranspose()) {
                    CopyND2NZWithTransData<IS_TRANS>(aMatrix, aGlobal, curRow, curCol, tileHeight,
                                                     tileWidth);
                } else {
                    CopyTileToCubeFromGMAndND<IS_TRANS>(aMatrix, aGlobal, curRow, curCol,
                        tileHeight, tileWidth);
                }
            } else {
                CopyTileToCubeFromGMAndND<IS_TRANS>(aMatrix, aGlobal, curRow, curCol,
                    tileHeight, tileWidth);
            }
        } else if constexpr (INPUT_TYPE::format == CubeFormat::NZ) {
            GlobalTensor<TRANS_T> aGlobal;
            aGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalTensor().address_);
            if constexpr (INPUT_TYPE::TAG == InputTypeTag::B && IsSameTypeV<TRANS_T, int8_t> &&
                          IsSameTypeV<SRC_T, int8_t>) {
                if (!IsTranspose()) {
                    CopyNZ2NZWithTransData<IS_TRANS>(aMatrix, aGlobal, curRow, curCol, tileHeight,
                                                     tileWidth);
                } else {
                    CopyNZ2NZ(aMatrix, aGlobal, curRow * MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight<IS_TRANS>(),
                              curCol * MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth<IS_TRANS>(), tileHeight,
                              tileWidth, MATMUL_MODULE(CopyCubeInParams)->template GetOrgHeight<IS_TRANS>());
                }
            } else {
                CopyNZ2NZ(aMatrix, aGlobal, curRow * MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight<IS_TRANS>(),
                          curCol * MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth<IS_TRANS>(), tileHeight, tileWidth,
                          MATMUL_MODULE(CopyCubeInParams)->template GetOrgHeight<IS_TRANS>());
            }
        } else if constexpr (INPUT_TYPE::format == CubeFormat::VECTOR) {
            // Vector input cannot be loaded when K runs along the row direction.
            if (MATMUL_MODULE(CopyCubeInParams)->IsKRowDirec()) {
                return false;
            }
            GlobalTensor<TRANS_T> aGlobal;
            aGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalTensor().address_);
            CopyVector2A1(aMatrix, aGlobal, curCol * MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth<IS_TRANS>(),
                          CeilT<int32_t>(tileWidth, c0Size_));
        } else {
            return false;
        }
        return true;
    }

    // ND tile from GM: uses the vector-op ND->NZ path when enVecND2NZ is
    // configured, otherwise converts on the fly during the MTE transfer.
    template <bool IS_TRANS = false>
    __aicore__ inline void CopyTileToCubeFromGMAndND(const LocalTensor<TRANS_T>& aMatrix,
                                                     GlobalTensor<SRC_T>& aGlobal,
                                                     int curRow, int curCol, int tileHeight, int tileWidth)
    {
        if constexpr (ToMatmulConfig(MM_CFG).enVecND2NZ) {
            CopyND2NZWithVecOp<IS_TRANS>(aMatrix, aGlobal, curRow, curCol, tileHeight, tileWidth);
        } else {
            CopyND2NZOnTheFly(aMatrix, aGlobal, curRow * MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight<IS_TRANS>(),
                curCol * MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth<IS_TRANS>(), tileHeight, tileWidth,
                MATMUL_MODULE(CopyCubeInParams)->template GetOrgWidth<IS_TRANS>());
        }
    }

    // Vector-op ND->NZ conversion entry: translates tile coordinates into
    // element offsets and delegates to CopyND2NZ. The static_assert rejects
    // enableL1CacheUB for non-MDL configurations at compile time.
    template <bool IS_TRANS = false>
    __aicore__ inline void CopyND2NZWithVecOp(const LocalTensor<TRANS_T>& aMatrix, const GlobalTensor<SRC_T>& src,
                                              int curRow, int curCol, int tileHeight, int tileWidth)
    {
        if constexpr (ToMatmulConfig(MM_CFG).enableL1CacheUB) {
            static_assert(DoMatmulMDL(MM_CFG), "Only MDL version support L1CacheUB.");
        }
        CopyND2NZ<IS_TRANS>(aMatrix, src, curRow * MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight<IS_TRANS>(),
            curCol * MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth<IS_TRANS>(), tileHeight, tileWidth,
            MATMUL_MODULE(CopyCubeInParams)->template GetOrgWidth<IS_TRANS>());
    }

    // Copies one tile from the unified buffer into L1 (aMatrix), mirroring the
    // GM variant above but reading through a LocalTensor bound to the stored
    // source address. Returns false for unsupported combinations, including the
    // ND + enVecND2NZ path, which is not implemented for UB sources.
    template <bool IS_TRANS = false>
    __aicore__ inline bool CopyTileToCubeFromUB(const LocalTensor<TRANS_T>& aMatrix, int curRow, int curCol,
                                                int tileHeight, int tileWidth)
    {
        if constexpr (INPUT_TYPE::format == CubeFormat::ND) {
            // int8 matrix B that is not transposed needs the transpose-data path.
            if constexpr (INPUT_TYPE::TAG == InputTypeTag::B && IsSameTypeV<TRANS_T, int8_t> &&
                          IsSameTypeV<SRC_T, int8_t>) {
                if (!IsTranspose()) {
                    LocalTensor<SRC_T> leftMatrix;
                    leftMatrix.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalTensor().address_);
                    CopyND2NZWithTransData<IS_TRANS>(aMatrix, leftMatrix, curRow, curCol, tileHeight,
                                                     tileWidth);
                } else {
                    if constexpr (ToMatmulConfig(MM_CFG).enVecND2NZ) {
                        return false;
                    } else {
                        CopyTileToCubeFromUBAndND<IS_TRANS>(aMatrix, curRow, curCol,
                            tileHeight, tileWidth);
                    }
                }
            } else {
                if constexpr (ToMatmulConfig(MM_CFG).enVecND2NZ) {
                    return false;
                } else {
                    CopyTileToCubeFromUBAndND<IS_TRANS>(aMatrix, curRow, curCol,
                        tileHeight, tileWidth);
                }
            }
        } else if constexpr (INPUT_TYPE::format == CubeFormat::NZ) {
            LocalTensor<SRC_T> leftMatrix;
            leftMatrix.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalTensor().address_);
            if constexpr (INPUT_TYPE::TAG == InputTypeTag::B && IsSameTypeV<TRANS_T, int8_t> &&
                          IsSameTypeV<SRC_T, int8_t>) {
                if (!IsTranspose()) {
                    CopyNZ2NZWithTransData<IS_TRANS>(aMatrix, leftMatrix, curRow, curCol, tileHeight,
                                                     tileWidth);
                } else {
                    CopyNZ2NZ(aMatrix, leftMatrix,
                              curRow * MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight<IS_TRANS>(),
                              curCol * MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth<IS_TRANS>(), tileHeight,
                              tileWidth, MATMUL_MODULE(CopyCubeInParams)->template GetOrgHeight<IS_TRANS>());
                }
            } else {
                CopyNZ2NZ(aMatrix, leftMatrix, curRow * MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight<IS_TRANS>(),
                          curCol * MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth<IS_TRANS>(), tileHeight, tileWidth,
                          MATMUL_MODULE(CopyCubeInParams)->template GetOrgHeight<IS_TRANS>());
            }
        } else if constexpr (INPUT_TYPE::format == CubeFormat::VECTOR) {
            // Vector input cannot be loaded when K runs along the row direction.
            if (MATMUL_MODULE(CopyCubeInParams)->IsKRowDirec()) {
                return false;
            }
            LocalTensor<SRC_T> leftMatrix;
            leftMatrix.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalTensor().address_);
            CopyVector2A1(aMatrix, leftMatrix, curCol * MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth<IS_TRANS>(),
                          CeilT<int32_t>(tileWidth, c0Size_));
        } else {
            return false;
        }
        return true;
    }

    // ND tile from UB: binds a LocalTensor to the stored source address and
    // converts ND->NZ on the fly during the transfer.
    template <bool IS_TRANS = false>
    __aicore__ inline void CopyTileToCubeFromUBAndND(const LocalTensor<TRANS_T>& aMatrix,
                                                     int curRow, int curCol,
                                                     int tileHeight, int tileWidth)
    {
        LocalTensor<SRC_T> leftMatrix;
        leftMatrix.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalTensor().address_);
        CopyND2NZOnTheFly(
            aMatrix, leftMatrix, curRow * MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight<IS_TRANS>(),
            curCol * MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth<IS_TRANS>(), tileHeight, tileWidth,
            MATMUL_MODULE(CopyCubeInParams)->template GetOrgWidth<IS_TRANS>());
    }

    // Copies a height x width NZ-formatted tile starting at (row, col) from a
    // GM source whose column height is gRow into dst. Falls back to one burst
    // per c0 column when the computed source stride would overflow the uint16
    // DataCopyParams field.
    __aicore__ inline void CopyNZ2NZ(const LocalTensor<TRANS_T>& dst, const GlobalTensor<TRANS_T>& src, const int row,
                                     const int col, const int height, const int width, const int gRow)
    {
        ASCENDC_ASSERT((gRow >= height), {
            KERNEL_LOG(
                KERNEL_ERROR,
                "NZ2NZ height larger than origin matrix height, gRow is %d, which should be no less than height %d.",
                gRow, height);
        });
        int alignedGRow = CeilAlignT<int32_t>(gRow, BLOCK_CUBE);
        int64_t srcOffset = (int64_t)row * (int64_t)c0Size_ + (int64_t)col * (int64_t)alignedGRow;
        // height direction need to be 16 aligned
        auto alignHeight = CeilAlignT<int32_t>(height, BLOCK_CUBE);
        int blockLen = alignHeight * c0Size_ * sizeof(TRANS_T) / ONE_BLK_SIZE;
        int srcStride = (alignedGRow - alignHeight) * (c0Size_ * sizeof(TRANS_T) / ONE_BLK_SIZE);

        if (srcStride >= UINT16_MAX) {
            // NOTE(review): the per-column step here uses gRow while srcOffset
            // is based on alignedGRow — confirm this is intended when gRow is
            // not BLOCK_CUBE-aligned.
            for (int i = 0; i < CeilT<int32_t>(width, c0Size_); ++i) {
                DataCopy(dst[i * alignHeight * c0Size_], src[srcOffset + i * gRow * c0Size_],
                         { 1, static_cast<uint16_t>(blockLen), 0, 0 });
            }
        } else {
            uint16_t nburst = CeilT<int32_t>(width, c0Size_);
            int dstStride = 0;
            DataCopy(dst, src[srcOffset],
                     { nburst, static_cast<uint16_t>(blockLen), static_cast<uint16_t>(srcStride),
                       static_cast<uint16_t>(dstStride) });
        }
    };

    // UB->L1 variant of CopyNZ2NZ: copies a height x width NZ tile starting at
    // (row, col) from a source whose column height is gRow. Falls back to one
    // burst per c0 column when the source stride would overflow the uint16
    // DataCopyParams field.
    __aicore__ inline void CopyNZ2NZ(const LocalTensor<TRANS_T>& dst, const LocalTensor<TRANS_T>& src, const int row,
                                     const int col, const int height, const int width, const int gRow)
    {
        ASCENDC_ASSERT((gRow >= height), {
            KERNEL_LOG(KERNEL_ERROR, "gRow is %d, which should be no less than height %d.", gRow, height);
        });
        int srcOffset = row * c0Size_ + col * gRow;
        // Height direction must be aligned up to the 16-row cube block
        // (replaces the magic-number expression "(height + 15) / 16 * 16").
        constexpr int32_t cubeBlock = 16;
        auto alignHeight = (height + cubeBlock - 1) / cubeBlock * cubeBlock;
        int blockLen = alignHeight * c0Size_ * sizeof(TRANS_T) / ONE_BLK_SIZE;
        int srcStride = (gRow - alignHeight) * (c0Size_ * sizeof(TRANS_T) / ONE_BLK_SIZE);

        if (srcStride >= UINT16_MAX) {
            // Stride field would overflow: issue one burst per c0 column.
            for (int i = 0; i < width / c0Size_; ++i) {
                DataCopy(dst[i * alignHeight * c0Size_], src[srcOffset + i * gRow * c0Size_],
                         { 1, static_cast<uint16_t>(blockLen), 0, 0 });
            }
        } else {
            DataCopy(dst, src[srcOffset],
                     { static_cast<uint16_t>(width / c0Size_), static_cast<uint16_t>(blockLen),
                       static_cast<uint16_t>(srcStride), 0 });
        }
    }

    // Copies `blockLen` blocks of a vector-formatted GM source, starting at
    // element offset `col`, into A1 as a single contiguous burst.
    __aicore__ inline void CopyVector2A1(const LocalTensor<TRANS_T>& dst, GlobalTensor<TRANS_T>& src, const int col,
                                         const int blockLen)
    {
        ASCENDC_ASSERT((col >= 0), { KERNEL_LOG(KERNEL_ERROR, "col is %d, which should be no less than 0.", col); });
        ASCENDC_ASSERT((INPUT_TYPE::format == CubeFormat::VECTOR),
                       { KERNEL_LOG(KERNEL_ERROR, "INPUT_TYPE::format should be CubeFormat::VECTOR."); });

        // One burst, contiguous on both sides.
        DataCopyParams copyParams;
        copyParams.blockCount = 1;
        copyParams.blockLen = blockLen;
        copyParams.srcStride = 0;
        copyParams.dstStride = 0;
        // Enhanced params select vector block mode for this transfer.
        DataCopyEnhancedParams enhanced;
        enhanced.blockMode = BlockMode::BLOCK_MODE_VECTOR;
        DataCopy(dst, src[col], copyParams, enhanced);
    }

    // Copies `blockLen` blocks of a vector-formatted UB source, starting at
    // element offset `col`, into A1 as a single contiguous burst.
    __aicore__ inline void CopyVector2A1(const LocalTensor<TRANS_T>& dst, LocalTensor<TRANS_T>& src, const int col,
                                         const int blockLen)
    {
        ASCENDC_ASSERT((col >= 0), { KERNEL_LOG(KERNEL_ERROR, "col is %d, which should be no less than 0.", col); });
        ASCENDC_ASSERT((INPUT_TYPE::format == CubeFormat::VECTOR),
                       { KERNEL_LOG(KERNEL_ERROR, "INPUT_TYPE::format should be CubeFormat::VECTOR."); });

        // One burst, contiguous on both sides.
        DataCopyParams copyParams;
        copyParams.blockCount = 1;
        copyParams.blockLen = blockLen;
        copyParams.srcStride = 0;
        copyParams.dstStride = 0;
        DataCopy(dst, src[col], copyParams);
    }

    /**
     * @description: Copy an ND tile (height x width) located at (row, col) of a GM matrix with leading
     * dimension gCol into dst (L1) in NZ format, staging through the local UB workspace.
     * When TRANS_T is half and SRC_T is int8_t, an anti-quantization (int8 -> half) pass is applied
     * before the ND->NZ transform (MDL config only, enforced by static_assert below).
     * @param: dst: destination LocalTensor in L1 (NZ layout on return).
     * @param: src: source GlobalTensor in GM (ND layout).
     * @param: row/col: tile origin inside the GM matrix.
     * @param: height/width: tile extents; gCol: GM matrix row stride in elements.
     */
    template <bool IS_TRANS = false>
    __aicore__ inline void CopyND2NZ(const LocalTensor<TRANS_T>& dst, const GlobalTensor<SRC_T>& src, const int row,
                                     const int col, const int height, const int width, const int gCol)
    {
        int calcWidth = CeilT(width, c0Size_);
        // Pad one extra block per row when the row byte width lands on a multiple of the 512B
        // cache line (and the row is short enough), to avoid UB bank conflicts.
        bool isBankConflict = calcWidth * EACH_BLOCK_BYTES_MM_API % CACHE_LINE_SIZE_MM_API == 0 &&
                              calcWidth < EACH_BLOCK_BYTES_MM_API ? true : false;
        int c0Size = c0Size_;
        if constexpr (IsSameTypeV<TRANS_T, half> && IsSameTypeV<SRC_T, int8_t>) {
            // Anti-quant path: the source c0 block holds 32 int8 elements.
            c0Size = 32;
        }
        int padWidth = isBankConflict ? Ceil(width, c0Size) + 1 : Ceil(width, c0Size);
        int size = MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength();
        if constexpr (IsSameTypeV<TRANS_T, half> && IsSameTypeV<SRC_T, int8_t>) {
            // Anti-quant path sizes the staging buffers from the (possibly padded) tile shape instead.
            size = Ceil(height, c0Size) * padWidth * c0Size * c0Size / AuxGetFactor<TRANS_T>();
        }

        // Two UB staging tensors carved out of the local workspace:
        // transTensor (offset 0) receives the raw GM data; trans (offset TransLength) holds
        // the intermediate for the anti-quant / ND->NZ transform.
        LocalTensor<SRC_T> transTensor;
        transTensor = MATMUL_MODULE(LocalWorkspace)->GetWorkspaceWithOffset(0).template ReinterpretCast<SRC_T>();
        transTensor.SetSize(size);
        LocalTensor<TRANS_T> trans;
        trans = MATMUL_MODULE(LocalWorkspace)->GetWorkspaceWithOffset(
            MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength()).template ReinterpretCast<TRANS_T>();
        trans.SetSize(size);
        int64_t srcOffset = (int64_t)row * (int64_t)gCol + (int64_t)col;

        int calcHigh = CeilT<int32_t>(height, BLOCK_CUBE);
        // Without the L1-cache-UB feature the workspace is reused every call, so vector ops from
        // the previous iteration must finish before MTE2 overwrites it.
        auto enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::V_MTE2);
        if constexpr (!ToMatmulConfig(MM_CFG).enableL1CacheUB) {
            SetFlag<HardEvent::V_MTE2>(enQueEvtID);
            WaitFlag<HardEvent::V_MTE2>(enQueEvtID);
        }
        if constexpr (ToMatmulConfig(MM_CFG).enableL1CacheUB) {
            // Cached path: fetch (or refill) the tile from the per-input UB cache.
            if constexpr (INPUT_TYPE::TAG == InputTypeTag::A) {
                calcWidth = GetANDBlockFromGM<IS_TRANS>(transTensor, src, row, col, height, width, gCol, isBankConflict);
            } else {
                calcWidth = GetBNDBlockFromGM<IS_TRANS>(transTensor, src, row, col, height, width, gCol, isBankConflict);
            }
        } else {
            calcWidth = CopyNDBlock(transTensor, src, srcOffset, height, width, gCol, isBankConflict);
        }

        if constexpr (IsSameTypeV<TRANS_T, half> && IsSameTypeV<SRC_T, int8_t>) {
            static_assert(DoMatmulMDL(MM_CFG), "Only MDL version support AntiQuant.");
            if (!IsTranspose()) {
                enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE2_S);
                SetFlag<HardEvent::MTE2_S>(enQueEvtID);
                WaitFlag<HardEvent::MTE2_S>(enQueEvtID);
            }
            // int8 -> half conversion into trans, then zero-pad and transform to NZ.
            AntiQuantCompute(trans, transTensor, isBankConflict);
            PipeBarrier<PIPE_V>();
            constexpr int32_t padBlock = 2;
            // NOTE(review): this local shadows the outer padWidth intentionally for the fp16 layout.
            int32_t padWidth = isBankConflict ? calcWidth + padBlock : calcWidth;
            // update fp16 padwidth
            (const_cast<LocalTensor<TRANS_T>&>(dst)).SetSize(size);
            SetMaskNorm();
            NDPadZeros(trans, height, padWidth, gCol, width, isBankConflict);
            LocalTensor<TRANS_T> nzTensor;
            nzTensor = MATMUL_MODULE(LocalWorkspace)->GetWorkspaceWithOffset(0).template ReinterpretCast<TRANS_T>();
            nzTensor.SetSize(size);
            PipeBarrier<PIPE_V>();
            NDTrans2NZ(nzTensor, trans, calcHigh, calcWidth, isBankConflict);
            // Vector work must complete before MTE3 moves the NZ result UB -> L1.
            enQueEvtID = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
            SetFlag<HardEvent::V_MTE3>(enQueEvtID);
            WaitFlag<HardEvent::V_MTE3>(enQueEvtID);
            DataCopy(dst, nzTensor, size);
            enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2);
            SetFlag<HardEvent::MTE3_MTE2>(enQueEvtID);
            WaitFlag<HardEvent::MTE3_MTE2>(enQueEvtID);
        } else {
            // Plain path: zero-pad in place, transform to NZ in trans, then copy UB -> L1.
            int padWidth = isBankConflict ? calcWidth + 1 : calcWidth;
            int size = calcHigh * padWidth * BLOCK_CUBE * c0Size_ / AuxGetFactor<TRANS_T>();
            transTensor.SetSize(size);
            trans.SetSize(size);
            (const_cast<LocalTensor<TRANS_T>&>(dst)).SetSize(size);
            NDPadZeros(transTensor, height, padWidth, gCol, width, isBankConflict);
            NDTrans2NZ(trans, transTensor, calcHigh, calcWidth, isBankConflict);
            enQueEvtID = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
            SetFlag<HardEvent::V_MTE3>(enQueEvtID);
            WaitFlag<HardEvent::V_MTE3>(enQueEvtID);
            DataCopy(dst, trans, size);
            enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE3_V);
            SetFlag<HardEvent::MTE3_V>(enQueEvtID);
            WaitFlag<HardEvent::MTE3_V>(enQueEvtID);
        }
        return;
    };

    /**
     * @description: Fetch one A-matrix ND tile into transTensor (UB), serving it from a UB tile
     * cache when possible. When the cache cursor (cache2UBProc_) is at the start or exhausted,
     * up to GetDepthL1CacheUB() consecutive tiles are bulk-loaded from GM into cacheHead2UB_;
     * subsequent calls are served by a UB->UB copy from that cache.
     * @return: calcWidth (tile width in c0 blocks) as computed by the last CopyNDBlock.
     */
    template <bool IS_TRANS = false>
    __aicore__ inline int32_t GetANDBlockFromGM(const LocalTensor<SRC_T>& transTensor, const GlobalTensor<SRC_T>& src,
                                                const int row, const int col, const int height, const int width,
                                                const int gCol, const bool isBankConflict)
    {
        // MTE1 reads of the cache must drain before MTE2 may refill it.
        auto enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE1_MTE2);
        SetFlag<HardEvent::MTE1_MTE2>(enQueEvtID);
        WaitFlag<HardEvent::MTE1_MTE2>(enQueEvtID);
        int64_t srcOffset = (int64_t)row * (int64_t)gCol + (int64_t)col;

        // Elements per cached tile: stepCol * stepRow base blocks of the configured buffer size.
        uint32_t cacheA1Size =
            MATMUL_MODULE(CopyCubeInParams)->GetStepCol() * MATMUL_MODULE(CopyCubeInParams)->GetStepRow() * MATMUL_MODULE(CopyCubeInParams)->GetBufferSize();
        int calcWidth = CeilT(width, c0Size_);
        if (cache2UBProc_ == 0 || cache2UBProc_ >= GetDepthL1CacheUB<INPUT_TYPE::TAG>()) {
            // Refill: (re)allocate the cache tensor through the queue so its events are managed.
            // NOTE(review): AllocTensor<TRANS_T> here vs AllocTensor<SRC_T> in GetBNDBlockFromGM —
            // confirm the asymmetry is intended.
            if (cache2UBProc_ == 0) {
                cacheHead2UB_ = qidUBCache_.template AllocTensor<TRANS_T>();
            } else {
                qidUBCache_.FreeTensor(cacheHead2UB_);
                cacheHead2UB_ = qidUBCache_.template AllocTensor<TRANS_T>(); // To use que to insert events
            }
            if (IsL1KFullLoad()) {
                // K fully resident in L1: iterate tiles along the M direction.
                for (int i = 0; i < GetDepthL1CacheUB<INPUT_TYPE::TAG>(); ++i) {
                    if (IsBufferPosEnd(i)) {
                        break;
                    }
                    int copyHeight = GetCopyHeight(i);
                    auto a1CacheUb = cacheHead2UB_[i * cacheA1Size];
                    calcWidth = CopyNDBlock(a1CacheUb, src, srcOffset, copyHeight, width, gCol, isBankConflict);
                    // NOTE(review): GetBaseHeight() is called without <IS_TRANS> in this loop,
                    // unlike the K-direction loop below — verify this is deliberate.
                    if (IsTranspose()) {
                        srcOffset += MATMUL_MODULE(CopyCubeInParams)->GetStepCol() *
                                     MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight();
                    } else {
                        srcOffset += MATMUL_MODULE(CopyCubeInParams)->GetStepCol() *
                                     MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight() * (int64_t)gCol;
                    }
                }
            } else {
                // Otherwise iterate tiles along the K direction.
                for (int i = 0; i < GetDepthL1CacheUB<INPUT_TYPE::TAG>(); ++i) {
                    if (IsBufferKPosEnd(i)) {
                        break;
                    }
                    int copyWidth = GetCopyWidth(i, MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth<IS_TRANS>());
                    auto a1CacheUb = cacheHead2UB_[i * cacheA1Size];
                    calcWidth = CopyNDBlock(a1CacheUb, src, srcOffset, height, copyWidth, gCol, isBankConflict);
                    if (IsTranspose()) {
                        srcOffset += MATMUL_MODULE(CopyCubeInParams)->GetStepRow() *
                                     MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight<IS_TRANS>() * (int64_t)gCol;
                    } else {
                        srcOffset += MATMUL_MODULE(CopyCubeInParams)->GetStepRow() *
                                     MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth<IS_TRANS>();
                    }
                }
            }
            cache2UBProc_ = 0;
            // GM loads must land before MTE1 serves from the cache.
            auto mte2ToMte1EvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE1);
            SetFlag<HardEvent::MTE2_MTE1>(mte2ToMte1EvtID);
            WaitFlag<HardEvent::MTE2_MTE1>(mte2ToMte1EvtID);
        }
        // fetch data from Cache
        uint16_t blockLen = cacheA1Size * sizeof(SRC_T) / ONE_BLK_SIZE;

        auto vToMte1EvtID = GetTPipePtr()->FetchEventID(HardEvent::V_MTE1);
        SetFlag<HardEvent::V_MTE1>(vToMte1EvtID);
        WaitFlag<HardEvent::V_MTE1>(vToMte1EvtID);
        DataCopy(transTensor, cacheHead2UB_[cache2UBProc_ * cacheA1Size], { 1, static_cast<uint16_t>(blockLen), 0, 0 });
        auto mte1ToVEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE1_V);
        SetFlag<HardEvent::MTE1_V>((event_t)mte1ToVEvtID);
        WaitFlag<HardEvent::MTE1_V>((event_t)mte1ToVEvtID);
        ++cache2UBProc_;
        // Release the cache once the final tile of this pass has been consumed.
        if (IsL1KFullLoad()) {
            if (IsBufferPosEnd()) {
                cache2UBProc_ = 0;
                qidUBCache_.FreeTensor(cacheHead2UB_);
            }
        } else {
            if (IsBufferKPosEnd()) {
                cache2UBProc_ = 0;
                qidUBCache_.FreeTensor(cacheHead2UB_);
            }
        }
        return calcWidth;
    }

    /**
     * @description: Fetch one B-matrix ND tile into transTensor (UB), serving it from a UB tile
     * cache when possible. Mirrors GetANDBlockFromGM but with B-matrix stepping: the K-full-load
     * path advances along N (rows of gCol stride), the other path advances along K.
     * @return: calcWidth (tile width in c0 blocks) as computed by the last CopyNDBlock.
     */
    template <bool IS_TRANS = false>
    __aicore__ inline int32_t GetBNDBlockFromGM(const LocalTensor<SRC_T>& transTensor, const GlobalTensor<SRC_T>& src,
                                                const int row, const int col, const int height, const int width,
                                                const int gCol, const bool isBankConflict)
    {
        // MTE1 reads of the cache must drain before MTE2 may refill it.
        auto enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE1_MTE2);
        SetFlag<HardEvent::MTE1_MTE2>(enQueEvtID);
        WaitFlag<HardEvent::MTE1_MTE2>(enQueEvtID);

        int64_t srcOffset = (int64_t)row * (int64_t)gCol + (int64_t)col;
        // Elements per cached tile.
        uint32_t cacheB1Size =
            MATMUL_MODULE(CopyCubeInParams)->GetStepCol() * MATMUL_MODULE(CopyCubeInParams)->GetStepRow() * MATMUL_MODULE(CopyCubeInParams)->GetBufferSize();

        int calcWidth = CeilT(width, c0Size_);

        if (cache2UBProc_ == 0 || cache2UBProc_ >= GetDepthL1CacheUB<INPUT_TYPE::TAG>()) {
            // Refill: (re)allocate through the queue so its events are managed.
            if (cache2UBProc_ == 0) {
                cacheHead2UB_ = qidUBCache_.template AllocTensor<SRC_T>();
            } else {
                qidUBCache_.FreeTensor(cacheHead2UB_);
                cacheHead2UB_ = qidUBCache_.template AllocTensor<SRC_T>(); // To use que to insert events
            }
            if (IsL1KFullLoad()) {
                for (int i = 0; i < GetDepthL1CacheUB<INPUT_TYPE::TAG>(); ++i) {
                    if (IsBufferPosEnd(i)) {
                        break;
                    }
                    int copyWidth = GetCopyWidth(i, MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight<IS_TRANS>());
                    auto b1CacheUb = cacheHead2UB_[i * cacheB1Size];
                    calcWidth = CopyNDBlock(b1CacheUb, src, srcOffset, height, copyWidth, gCol, isBankConflict);
                    srcOffset += MATMUL_MODULE(CopyCubeInParams)->GetStepCol() *
                                 MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight<IS_TRANS>() * (int64_t)gCol;
                }
            } else {
                for (int i = 0; i < GetDepthL1CacheUB<INPUT_TYPE::TAG>(); ++i) {
                    if (IsBufferKPosEnd(i)) {
                        break;
                    }
                    // NOTE(review): "copyHeight" is computed via GetCopyWidth over the base
                    // height and then passed as CopyNDBlock's height — confirm this mapping.
                    int copyHeight =
                        GetCopyWidth(i, MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight<IS_TRANS>());
                    auto b1CacheUb = cacheHead2UB_[i * cacheB1Size];
                    calcWidth = CopyNDBlock(b1CacheUb, src, srcOffset, copyHeight, width, gCol, isBankConflict);
                    srcOffset += MATMUL_MODULE(CopyCubeInParams)->GetStepRow() *
                                 MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth<IS_TRANS>();
                }
            }
            cache2UBProc_ = 0;
            // GM loads must land before MTE1 serves from the cache.
            auto mte2ToMte1EvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE1);
            SetFlag<HardEvent::MTE2_MTE1>(mte2ToMte1EvtID);
            WaitFlag<HardEvent::MTE2_MTE1>(mte2ToMte1EvtID);
        }
        // fetch data from Cache
        uint16_t blockLen = cacheB1Size * sizeof(SRC_T) / ONE_BLK_SIZE;
        auto vToMte1EvtID = GetTPipePtr()->FetchEventID(HardEvent::V_MTE1);
        SetFlag<HardEvent::V_MTE1>(vToMte1EvtID);
        WaitFlag<HardEvent::V_MTE1>(vToMte1EvtID);
        DataCopy(transTensor, cacheHead2UB_[cache2UBProc_ * cacheB1Size], { 1, static_cast<uint16_t>(blockLen), 0, 0 });
        auto mte1ToVEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE1_V);
        SetFlag<HardEvent::MTE1_V>((event_t)mte1ToVEvtID);
        WaitFlag<HardEvent::MTE1_V>((event_t)mte1ToVEvtID);
        ++cache2UBProc_;
        // Release the cache once the final tile of this pass has been consumed.
        if (IsL1KFullLoad()) {
            if (IsBufferPosEnd()) {
                cache2UBProc_ = 0;
                qidUBCache_.FreeTensor(cacheHead2UB_);
            }
        } else {
            if (IsBufferKPosEnd()) {
                cache2UBProc_ = 0;
                qidUBCache_.FreeTensor(cacheHead2UB_);
            }
        }
        return calcWidth;
    }

    // v100, v200
    /**
     * @description: Copy one ND tile (height rows of width elements, row stride gCol) from GM
     * into the UB tensor transTensor, choosing the fastest legal DataCopy pattern:
     *  - gCol not c0-aligned: one DataCopy per row (stride field cannot express the gap);
     *  - aligned but srcStride overflows uint16: one DataCopy per row;
     *  - otherwise: multi-row bursts, split so blockCount stays within the 4095 hardware limit.
     * When isBankConflict is set, one extra 32B block of padding is inserted per destination row.
     * @return: tile width in c0 blocks.
     */
    __aicore__ inline int CopyNDBlock(const LocalTensor<SRC_T>& transTensor, const GlobalTensor<SRC_T>& src,
                                      int64_t srcOffset, const int height, const int width, const int gCol,
                                      const bool isBankConflict)
    {
        ASCENDC_ASSERT((gCol >= width),
                       { KERNEL_LOG(KERNEL_ERROR, "gCol is %d, which should be no less than %d.", gCol, width); });
        int32_t oriC0Size = AuxGetC0Size<SRC_T>();
        int32_t calcWidthExr = CeilAlignT<int32_t>(width, oriC0Size);
        int32_t calcWidth = CeilT<int32_t>(calcWidthExr, c0Size_);

        // gCol unaligned
        if (gCol % oriC0Size) {
            int blockLen = calcWidthExr * sizeof(SRC_T) / DEFAULT_C0_SIZE;
            int dstOffset = 0;
            int BankConflictPadSize = isBankConflict ? (EACH_BLOCK_BYTES_MM_API / sizeof(SRC_T)) : 0;

            // data copy stride is unaligned, need to copy line by line
            for (int i = 0; i < height; i++) {
                DataCopy(transTensor[dstOffset], src[srcOffset], { 1, static_cast<uint16_t>(blockLen), 0, 0 });
                dstOffset += (calcWidthExr + BankConflictPadSize);
                srcOffset += gCol;
            }

            auto enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE2_V);
            SetFlag<HardEvent::MTE2_V>((event_t)enQueEvtID);
            WaitFlag<HardEvent::MTE2_V>((event_t)enQueEvtID);
        } else {
            int srcStride = (gCol - width) * sizeof(SRC_T) / ONE_BLK_SIZE;
            int blocklen = CeilT<int32_t>(width * sizeof(SRC_T), ONE_BLK_SIZE);
            if (srcStride >= UINT16_MAX) {
                // Row gap does not fit the uint16 stride field: copy line by line.
                int dstOffset = isBankConflict ? (width + oriC0Size) : width;
                for (int i = 0; i < height; ++i) {
                    DataCopy(transTensor[i * dstOffset], src[srcOffset], { 1, static_cast<uint16_t>(blocklen), 0, 0 });
                    srcOffset += gCol;
                }
            } else {
                uint16_t dstStride = isBankConflict ? 1 : 0;
                int loopNum = CeilT<int32_t>(static_cast<uint16_t>(height), MAX_BLOCK_COUNT_SIZE_MM_API);
                int tailCount = static_cast<uint16_t>(height) % MAX_BLOCK_COUNT_SIZE_MM_API;
                // Bug fix: when height is an exact multiple of MAX_BLOCK_COUNT_SIZE_MM_API the
                // modulo yields 0, which previously issued the last DataCopy with blockCount = 0
                // (invalid; nothing copied). The final burst must carry a full count instead.
                if (tailCount == 0) {
                    tailCount = MAX_BLOCK_COUNT_SIZE_MM_API;
                }
                for (int i = 0; i < loopNum; ++i) {
                    uint16_t blockCount = (i == loopNum - 1) ? tailCount : MAX_BLOCK_COUNT_SIZE_MM_API;
                    DataCopy(
                        transTensor[i * MAX_BLOCK_COUNT_SIZE_MM_API * blocklen * ONE_BLK_SIZE / sizeof(SRC_T)],
                        src[srcOffset + i * MAX_BLOCK_COUNT_SIZE_MM_API * blocklen * ONE_BLK_SIZE / sizeof(SRC_T)],
                        { blockCount, static_cast<uint16_t>(blocklen), static_cast<uint16_t>(srcStride), dstStride });
                }
            }
            auto enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE2_V);
            SetFlag<HardEvent::MTE2_V>((event_t)enQueEvtID);
            WaitFlag<HardEvent::MTE2_V>((event_t)enQueEvtID);
        }
        return calcWidth;
    }

    // v100, v200
    /**
     * @description: Pad zeros for the ND matrix for width.
     * Zeros the tail columns (from element `tail` up to 16) of every row by running Duplicate
     * with a bit mask: the 16-bit tail mask is replicated into all four 16-lane groups of the
     * 64-lane mask, so each repeat covers REPEAT_BLOCK_NUM_MM_API (8) rows.
     * @param: dst: LocalTensor in L1.
     * @param: height: Height of the tile to be loaded.
     * @param: calcWidth: Width of the tile to be calculated.
     * @param: tail: tailSize.
     * @param: offset: offset from head addr to the block.
     * @return: void
     */
    __aicore__ inline void NDPadZeroForWidth(LocalTensor<TRANS_T>& dst,
        const int height, const int calcWidth, const int tail, int offset)
    {
        // tail == 16 makes the mask all-zero (1 << 16 wraps in uint16_t): nothing to pad.
        uint16_t mask_tail_16bit = ~((1 << tail) - 1);
        uint64_t mask_tail_64bit = mask_tail_16bit;
        if (mask_tail_64bit == 0) {
            return;
        }
        uint64_t mask[2];
        // Replicate the 16-bit tail mask across all four 16-bit lanes of each 64-bit half.
        mask[0] = mask_tail_64bit + (mask_tail_64bit << FIRST_16BIT_OFFSET_MM_API) +
                    (mask_tail_64bit << SECOND_16BIT_OFFSET_MM_API) + (mask_tail_64bit << THIRD_16BIT_OFFSET_MM_API);
        mask[1] = mask[0];
        // Row pitch in 32B blocks.
        int stride = calcWidth * (c0Size_ * sizeof(TRANS_T) / DEFAULT_C0_SIZE);
        int32_t totalRep = CeilT<int32_t>(height, REPEAT_BLOCK_NUM_MM_API);
        if constexpr (IsSameTypeV<TRANS_T, int8_t>) {
            // int8 has no native Duplicate lane layout here: operate on an int16 view
            // (caller already halved tail/offset accordingly).
            LocalTensor<int16_t> tmpTransTensor = dst.template ReinterpretCast<int16_t>();
            if (stride < EACH_BLOCK_BYTES_MM_API) {
                if (totalRep <= MAX_REPEAT_TIMES) {
                    Duplicate(tmpTransTensor[offset], (int16_t)0, mask,
                        CeilT<int32_t>(height, REPEAT_BLOCK_NUM_MM_API), stride, REPEAT_BLOCK_NUM_MM_API * stride);
                } else {
                    // More than MAX_REPEAT_TIMES repeats: chunk the Duplicate calls.
                    int32_t highBlock = totalRep / MAX_REPEAT_TIMES;
                    int32_t highTail = totalRep % MAX_REPEAT_TIMES;
                    int64_t dstOffset = calcWidth * BLOCK_CUBE * REPEAT_BLOCK_NUM_MM_API * MAX_REPEAT_TIMES;
                    for (int32_t idx = 0; idx < highBlock; ++idx) {
                        Duplicate(tmpTransTensor[offset], (int16_t)0, mask,
                            MAX_REPEAT_TIMES, stride, REPEAT_BLOCK_NUM_MM_API * stride);
                        offset += dstOffset;
                    }
                    if (highTail) {
                        Duplicate(tmpTransTensor[offset], (int16_t)0, mask, highTail,
                            stride, REPEAT_BLOCK_NUM_MM_API * stride);
                    }
                }
            } else {
                // Repeat stride would exceed the hardware limit: pad one 16-row group per call.
                for (int32_t i = 0; i < totalRep; ++i) {
                    Duplicate(tmpTransTensor[offset], (int16_t)0, mask, 1, stride, 0);
                    offset += stride * BLOCK_CUBE;
                }
            }
        } else {
            Duplicate(dst[offset], (TRANS_T)0, mask, totalRep, stride, REPEAT_BLOCK_NUM_MM_API * stride);
        }
        PipeBarrier<PIPE_V>();
    }

    // v100, v200
    /**
     * @description: Pad zeros for the ND matrix.
     * Zeros (a) the tail columns of the last partial c0 block when width is not c0-aligned, and
     * (b) the rows between height and the next multiple of BLOCK_CUBE (16).
     * @param: dst: LocalTensor in L1.
     * @param: height: Height of the tile to be loaded.
     * @param: calcWidth: Width of the tile to be calculated.
     * @param: gCol: Origin matrix width.
     * @param: width: Width of the tile to be loaded.
     * @param: isBankConflict: The flag of whether is the bank conflict scene.
     * @return: void
     */
    __aicore__ inline void NDPadZeros(LocalTensor<TRANS_T>& dst, const int height, const int calcWidth, const int gCol,
                                      const int width, bool isBankConflict)
    {
        int tail = width % c0Size_;
        // Column padding is only needed when the GM row stride is itself unaligned AND the
        // tile width leaves a partial c0 block.
        if ((gCol % BLOCK_CUBE != 0) && (tail != 0)) {
            // tail pad zero
            constexpr int32_t DIV_TWO = 2;
            auto offset = width / c0Size_ * c0Size_;
            if constexpr (IsSameType<TRANS_T, int8_t>::value) {
                // int8 is padded through an int16 view, so tail and offset are halved.
                tail = CeilT(tail, DIV_TWO);
                offset /= DIV_TWO;
            }
            NDPadZeroForWidth(dst, height, calcWidth, tail, offset);
        }
        // If the value of high is not an integer multiple of 16, add 0.
        int tailHigh = height % BLOCK_CUBE;
        if (tailHigh) {
            auto dstOffset = height * calcWidth * BLOCK_CUBE;
            if constexpr (IsSameTypeV<TRANS_T, int8_t>) {
                // Same int16-view trick for int8 data.
                LocalTensor<int16_t> tmpDst = dst.template ReinterpretCast<int16_t>();
                Duplicate(tmpDst[dstOffset], (int16_t)0, (BLOCK_CUBE - tailHigh) * calcWidth * BLOCK_CUBE);
            } else {
                Duplicate(dst[dstOffset], (TRANS_T)0, (BLOCK_CUBE - tailHigh) * calcWidth * BLOCK_CUBE);
            }
        }
    }

    // v100, v200
    /**
     * @description: NDTrans2NZForInt8.
     * ND -> NZ layout transform for int8 tiles, implemented as a strided Muls-by-1 copy over an
     * int16 reinterpretation of the buffers (one 32B int8 c0 block == 16 int16 lanes).
     * @param: dst: LocalTensor in L1.
     * @param: src: GlobalTensor in GM.
     * @param: calcHigh: Height of the tile to be calculated.
     * @param: calcWidth: Width of the tile to be calculated.
     * @param: isBankConflict: The flag of whether is the bank conflict scene.
     * @return: void
     */
    __aicore__ inline void NDTrans2NZForInt8(LocalTensor<TRANS_T>& dst,
        LocalTensor<TRANS_T>& src, const int calcHigh, const int calcWidth, const bool isBankConflict)
    {
        struct UnaryRepeatParams intriParams;
        uint64_t mask[2] = { uint64_t(-1), uint64_t(-1) };
        // Bank-conflict layouts carry one extra pad block per source row.
        int blkStride = isBankConflict ? calcWidth + 1 : calcWidth;
        intriParams.dstBlkStride = (c0Size_ * sizeof(TRANS_T) / DEFAULT_C0_SIZE);
        intriParams.srcBlkStride = blkStride * (c0Size_ * sizeof(TRANS_T) / DEFAULT_C0_SIZE);
        intriParams.dstRepStride = intriParams.dstBlkStride * DEFAULT_BLK_NUM;
        intriParams.srcRepStride = intriParams.srcBlkStride * DEFAULT_BLK_NUM;
        int dstOffset = 0;
        int srcOffset = 0;
        // ensure rep stride be less than 256
        constexpr int maxSrcBlkStride = 32;
        LocalTensor<int16_t> tmpSrc = src.template ReinterpretCast<int16_t>();
        LocalTensor<int16_t> tmpDst = dst.template ReinterpretCast<int16_t>();
        if (intriParams.srcBlkStride >= maxSrcBlkStride) {
            // Wide rows: the block stride no longer fits the instruction encoding, so fall back
            // to moving one 16-lane group per Muls call.
            intriParams.dstBlkStride = 1;
            intriParams.srcBlkStride = 1;
            mask[0] = (1 << BLOCK_CUBE) - 1;
            mask[1] = 0;
            SetVectorMask<int16_t>(mask[1], mask[0]);
            for (int i = 0; i < calcWidth; i++) {
                for (int j = 0; j < calcHigh * BLOCK_CUBE; ++j) {
                    dstOffset = i * calcHigh * CUBE_MAX_SIZE + j * BLOCK_CUBE;
                    srcOffset = j * blkStride * BLOCK_CUBE + i * BLOCK_CUBE;
                    Muls<int16_t, false>(tmpDst[dstOffset], tmpSrc[srcOffset], (int16_t)1, mask, 1, intriParams);
                }
            }
        } else {
            // Fast path: gather one c0 column per outer iteration with full-width masks,
            // chunking repeats to stay within MAX_REPEAT_TIMES.
            SetVectorMask<int16_t>(mask[1], mask[0]);
            int32_t totalRepTimes = 2 * calcHigh;
            int32_t highBlock = totalRepTimes / MAX_REPEAT_TIMES;
            int32_t highTail = totalRepTimes % MAX_REPEAT_TIMES;
            for (int i = 0; i < calcWidth; i++) {
                dstOffset = i * calcHigh * CUBE_MAX_SIZE;
                srcOffset = i * BLOCK_CUBE;
                for (int32_t idx = 0; idx < highBlock; ++idx) {
                    Muls<int16_t, false>(tmpDst[dstOffset], tmpSrc[srcOffset], (int16_t)1, mask, MAX_REPEAT_TIMES,
                                            intriParams);
                    dstOffset += BLOCK_CUBE * MAX_REPEAT_TIMES * REPEAT_BLOCK_NUM_MM_API;
                    srcOffset += calcWidth * BLOCK_CUBE * MAX_REPEAT_TIMES * REPEAT_BLOCK_NUM_MM_API;
                }
                if (highTail) {
                    Muls<int16_t, false>(tmpDst[dstOffset], tmpSrc[srcOffset], (int16_t)1, mask, highTail,
                                            intriParams);
                }
            }
        }
    }

    // v100, v200
    /**
     * @description: NDTrans2NZForFP16.
     * ND -> NZ layout transform for non-int8 tiles (half/float), implemented as a strided
     * Muls-by-1 copy. For float, each 16-element row spans two 32B blocks, so a second Muls
     * handles the upper half (the sizeof(float) branches below).
     * @param: dst: LocalTensor in L1.
     * @param: src: GlobalTensor in GM.
     * @param: calcHigh: Height of the tile to be calculated.
     * @param: calcWidth: Width of the tile to be calculated.
     * @param: isBankConflict: The flag of whether is the bank conflict scene.
     * @return: void
     */
    __aicore__ inline void NDTrans2NZForFP16(LocalTensor<TRANS_T>& dst,
        LocalTensor<TRANS_T>& src, const int calcHigh, const int calcWidth, const bool isBankConflict)
    {
        const int c0Count = AscendCUtils::GetC0Count(sizeof(TRANS_T));
        struct UnaryRepeatParams intriParams;
        uint64_t mask[2] = { uint64_t(-1), uint64_t(-1) };
        int32_t padBlock = 1;
        constexpr int32_t BLOCK_NUM = 2;
        if constexpr (IsSameTypeV<TRANS_T, half> && IsSameTypeV<SRC_T, int8_t>) {
            // Anti-quant layout pads two blocks per row instead of one.
            padBlock = BLOCK_NUM;
        }
        int blkStride = isBankConflict ? calcWidth + padBlock : calcWidth;
        intriParams.dstBlkStride = (BLOCK_CUBE * sizeof(TRANS_T) / DEFAULT_C0_SIZE);
        intriParams.srcBlkStride = blkStride * BLOCK_CUBE * sizeof(TRANS_T) / DEFAULT_C0_SIZE;
        intriParams.dstRepStride = intriParams.dstBlkStride * DEFAULT_BLK_NUM;
        intriParams.srcRepStride = intriParams.srcBlkStride * DEFAULT_BLK_NUM;
        int dstOffset = 0;
        int srcOffset = 0;
        // ensure rep stride be less than 256
        constexpr int maxSrcBlkStride = 32;
        if (intriParams.srcBlkStride >= maxSrcBlkStride) {
            // Wide rows: stride exceeds the instruction encoding, copy one 16-lane row at a time.
            intriParams.dstBlkStride = 1;
            intriParams.srcBlkStride = 1;
            mask[0] = (1 << BLOCK_CUBE) - 1;
            mask[1] = 0;
            SetVectorMask<TRANS_T>(mask[1], mask[0]);
            for (int i = 0; i < calcWidth; i++) {
                for (int j = 0; j < calcHigh * BLOCK_CUBE; ++j) {
                    dstOffset = i * calcHigh * CUBE_MAX_SIZE + j * BLOCK_CUBE;
                    srcOffset = j * blkStride * BLOCK_CUBE + i * BLOCK_CUBE;
                    Muls<TRANS_T, false>(dst[dstOffset], src[srcOffset], (TRANS_T)1, mask, 1, intriParams);
                    if constexpr (sizeof(TRANS_T) == sizeof(float)) {
                        Muls<TRANS_T, false>(dst[dstOffset + c0Count], src[srcOffset + c0Count], (TRANS_T)1, mask,
                                                1, intriParams);
                    }
                }
            }
        } else {
            // Fast path: one (or two, for float) Muls per c0 column covers the whole height.
            SetVectorMask<TRANS_T>(mask[1], mask[0]);
            for (int i = 0; i < calcWidth; i++) {
                dstOffset = i * calcHigh * CUBE_MAX_SIZE;
                srcOffset = i * BLOCK_CUBE;
                Muls<TRANS_T, false>(dst[dstOffset], src[srcOffset], (TRANS_T)1, mask, BLOCK_NUM * calcHigh, intriParams);
                if constexpr (sizeof(TRANS_T) == sizeof(float)) {
                    Muls<TRANS_T, false>(dst[dstOffset + c0Count], src[srcOffset + c0Count], (TRANS_T)1, mask,
                                            BLOCK_NUM * calcHigh, intriParams);
                }
            }
        }
    }

    // v100, v200
    /**
     * @description: NDTrans2NZ — convert an ND-layout tile into NZ layout.
     * Compile-time dispatcher: int8 tiles take the int16-reinterpret path, all other
     * element types take the fp16/fp32 path.
     * @param: dst: LocalTensor in L1.
     * @param: src: GlobalTensor in GM.
     * @param: calcHigh: Height of the tile to be calculated.
     * @param: calcWidth: Width of the tile to be calculated.
     * @param: isBankConflict: The flag of whether is the bank conflict scene.
     * @return: void
     */
    __aicore__ inline void NDTrans2NZ(LocalTensor<TRANS_T>& dst, LocalTensor<TRANS_T>& src, const int calcHigh,
                                      const int calcWidth, const bool isBankConflict)
    {
        // Both implementations realize the transform via strided Muls-by-1 copies.
        if constexpr (!IsSameTypeV<TRANS_T, int8_t>) {
            NDTrans2NZForFP16(dst, src, calcHigh, calcWidth, isBankConflict);
        } else {
            NDTrans2NZForInt8(dst, src, calcHigh, calcWidth, isBankConflict);
        }
    }

    // v100, v200
    /**
     * @description: Copy an ND tile (height x width) at (row, col) of a GM matrix (leading
     * dimension gCol) directly into dst (L1) in NZ layout, column-of-c0-blocks at a time,
     * without staging the whole tile through UB. Only the tail column (width % c0Size_) is
     * routed through a small UB workspace so it can be zero-padded before landing in L1.
     */
    __aicore__ inline void CopyND2NZOnTheFly(const LocalTensor<TRANS_T>& dst, GlobalTensor<SRC_T>& src, const int row,
                                             const int col, const int height, const int width, const int gCol)
    {
        ASSERT(gCol >= width && "Copy ND block gm->ub width larger than origin matrix width.");
        int calcWidth = width / c0Size_; // cube block numbers that do not need to be pad zero
        int tail = width % c0Size_;
        int dstOffset = 0;
        int64_t srcOffset = ((int64_t)row * (int64_t)gCol + (int64_t)col);
        int calcWidthExr = CeilT<int32_t>(width, c0Size_);
        int calcHeightExr = CeilT<int32_t>(height, BLOCK_CUBE);

#if __CCE_AICORE__ == 200
        // set2d, pad tail zero
        if (height % BLOCK_CUBE != 0) {
            // Pre-clear the whole destination so partial-height columns end up zero-padded.
            int64_t repeat = calcWidthExr * calcHeightExr;
            InitConstValueParams<TRANS_T> initConstValueParams;
            initConstValueParams.repeatTimes = (uint16_t)repeat;
            initConstValueParams.initValue = 0;
            InitConstValue(dst, initConstValueParams);
            PipeBarrier<PIPE_MTE2>();
        }
#endif

        // gCol unaligned, can not use dma copy repeat stride
        if (tail != 0) {
            // tail elements that need to be pad zero
            int blockLen = calcWidthExr * (c0Size_ * sizeof(TRANS_T) / DEFAULT_C0_SIZE);

            // gm->l1
            int srcGap = gCol * sizeof(TRANS_T) / ONE_BLK_SIZE - 1;
            if (gCol % c0Size_ || srcGap >= UINT16_MAX) {
                // each block len is only 32B
                for (int i = 0; i < calcWidth; i++) {
                    for (int j = 0; j < height; j++) {
                        DataCopy(dst[dstOffset + i * calcHeightExr * BLOCK_CUBE * c0Size_ + j * c0Size_],
                                 src[srcOffset + j * gCol + i * c0Size_], { 1, 1, 0, 0 });
                    }
                }
            } else {
                // data copy stride is aligned
                for (int i = 0; i < calcWidth; i++) {
                    DataCopy(dst[dstOffset], src[srcOffset],
                             { static_cast<uint16_t>(height), 1, static_cast<uint16_t>(srcGap), 0 });
                    dstOffset += calcHeightExr * BLOCK_CUBE * c0Size_;
                    srcOffset += c0Size_;
                }
            }

            LocalTensor<TRANS_T> trans;
            // tail gm->ub pad zero, and then ub->l1

            trans = MATMUL_MODULE(LocalWorkspace)->GetND2NZWorkspace(0)
                        .template ReinterpretCast<TRANS_T>();
            int tranSize = width * EACH_BLOCK_BYTES_MM_API / sizeof(TRANS_T);
            trans.SetSize(tranSize);

            // Source offset of the first tail element (start of the last, partial c0 block).
            int64_t tailSrcoffset = (int64_t)row * (int64_t)gCol + (int64_t)col + (int64_t)calcWidth * (int64_t)c0Size_;

            // gm->ub
            for (int i = 0; i < height; i++) {
                DataCopy(trans[i * c0Size_], src[tailSrcoffset], { 1, 1, 0, 0 });
                tailSrcoffset += gCol;
            }

            event_t eventIDMte2ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V));
            SetFlag<HardEvent::MTE2_V>(eventIDMte2ToV);
            WaitFlag<HardEvent::MTE2_V>(eventIDMte2ToV);

            // tail pad zero
            uint64_t mask[2];
            constexpr int32_t DIV_TWO = 2;
            if constexpr (IsSameTypeV<TRANS_T, int8_t>) {
                // int8 is zeroed through an int16 view, so halve the tail lane count.
                tail = CeilT<int32_t>(tail, DIV_TWO);
            }
            // Replicate the 16-bit "columns >= tail" mask across all four lane groups.
            uint16_t mask_tail_16bit = ~((1 << tail) - 1);
            uint64_t mask_tail_64bit = mask_tail_16bit;
            mask[0] = mask_tail_64bit + (mask_tail_64bit << FIRST_16BIT_OFFSET_MM_API) +
                (mask_tail_64bit << SECOND_16BIT_OFFSET_MM_API) + (mask_tail_64bit << THIRD_16BIT_OFFSET_MM_API);
            mask[1] = mask[0];
            constexpr int32_t DUP_CEIL_NUM = 8;
            if constexpr (IsSameTypeV<TRANS_T, int8_t>) {
                LocalTensor<int16_t> tmpTrans = trans.template ReinterpretCast<int16_t>();
                Duplicate(tmpTrans, (int16_t)0, mask, CeilT<int32_t>(height, DUP_CEIL_NUM), 1, DUP_CEIL_NUM);
            } else {
                Duplicate(trans, (TRANS_T)0, mask, CeilT<int32_t>(height, DUP_CEIL_NUM), 1, DUP_CEIL_NUM);
            }

            event_t eventIDVToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
            SetFlag<HardEvent::V_MTE3>(eventIDVToMte3);
            WaitFlag<HardEvent::V_MTE3>(eventIDVToMte3);

            // ub->l1
            int heightAlignBlock = CeilT<int32_t>(height, BLOCK_CUBE);
            int tailDstOffset = heightAlignBlock * BLOCK_CUBE * c0Size_ * calcWidth;
            DataCopy(dst[tailDstOffset], trans, { static_cast<uint16_t>(height), 1, 0, 0 });
        } else {
            // Aligned width: every c0 column goes straight GM -> L1.
            int srcGap = gCol * sizeof(TRANS_T) / ONE_BLK_SIZE - 1;
            if (gCol % c0Size_ != 0 || srcGap >= UINT16_MAX) {
                int64_t oriSrcOffset = srcOffset;
                int oriDstOffset = dstOffset;
                // each block len is only 32B
                for (int i = 0; i < calcWidth; i++) {
                    for (int j = 0; j < height; j++) {
                        DataCopy(dst[dstOffset], src[srcOffset], { 1, 1, 0, 0 });
                        dstOffset += c0Size_;
                        srcOffset += gCol;
                    }
                    srcOffset = oriSrcOffset + (i + 1) * c0Size_;
                    dstOffset = oriDstOffset + (i + 1) * calcHeightExr * BLOCK_CUBE * c0Size_;
                }
            } else {
                // data copy stride is aligned
                for (int i = 0; i < calcWidth; i++) {
                    DataCopy(dst[dstOffset], src[srcOffset],
                             { static_cast<uint16_t>(height), 1, static_cast<uint16_t>(srcGap), 0 });
                    dstOffset += calcHeightExr * BLOCK_CUBE * c0Size_;
                    srcOffset += c0Size_;
                }
            }
            event_t eventIDMte2ToMte1 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE1));
            SetFlag<HardEvent::MTE2_MTE1>(eventIDMte2ToMte1);
            WaitFlag<HardEvent::MTE2_MTE1>(eventIDMte2ToMte1);
        }
    }

    // v100, v200
    // v100/v200 path: copies an ND tile from UB to L1 in NZ layout "on the fly",
    // i.e. without a full transpose staging buffer (only the tail column block is staged).
    // dst          : L1 destination tensor, filled in NZ (fractal) layout.
    // src          : UB source tensor holding the ND matrix.
    // row, col     : tile origin inside the source matrix, in elements.
    // height, width: tile size in elements.
    // gCol         : source matrix row pitch in elements; must be >= width (asserted).
    __aicore__ inline void CopyND2NZOnTheFly(const LocalTensor<TRANS_T>& dst, LocalTensor<SRC_T>& src, const int row,
                                             const int col, const int height, const int width, const int gCol)
    {
        ASSERT(gCol >= width && "Copy ND block ub->ub width larger than origin matrix width.");
        int calcWidth = width / c0Size_; // cube block numbers that do not need to be pad zero
        int tail = width % c0Size_;      // leftover elements in the last (partial) c0 block
        int dstOffset = 0;
        int srcOffset = (row * gCol + col);
        int calcHeightExr = CeilT<int32_t>(height, BLOCK_CUBE); // height in BLOCK_CUBE-row fractal units

#if __CCE_AICORE__ == 200
        // set2d, pad tail zero: when height is not fractal-aligned, pre-fill the whole dst
        // region with zeros so rows beyond 'height' hold defined data.
        if (height % BLOCK_CUBE != 0) {
            int calcWidthExr = CeilT<int32_t>(width, c0Size_);
            int64_t repeat = calcWidthExr * calcHeightExr;
            InitConstValueParams<TRANS_T> initConstValueParams;
            initConstValueParams.repeatTimes = (uint16_t)repeat;
            initConstValueParams.initValue = 0;
            InitConstValue(dst, initConstValueParams);

            event_t eventIDMte2ToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE3));
            SetFlag<HardEvent::MTE2_MTE3>(eventIDMte2ToMte3);
            WaitFlag<HardEvent::MTE2_MTE3>(eventIDMte2ToMte3);
        }
#endif

        DataCopyEnhancedParams enhancedParams;
        enhancedParams.blockMode = BlockMode::BLOCK_MODE_VECTOR;

        // gCol unaligned, can not use dma copy repeat stride
        if (tail != 0) {
            // ub->l1: copy the full c0 blocks first
            int srcGap = gCol * sizeof(TRANS_T) / ONE_BLK_SIZE - 1;
            if (gCol % c0Size_ || srcGap >= UINT16_MAX) {
                // each block len is only 32B
                for (int i = 0; i < calcWidth; i++) {
                    for (int j = 0; j < height; j++) {
                        DataCopy(dst[dstOffset + i * calcHeightExr * BLOCK_CUBE * c0Size_ + j * c0Size_],
                                 src[srcOffset + j * gCol + i * c0Size_], { 1, 1, 0, 0 }, enhancedParams);
                    }
                }
            } else {
                // data copy stride is aligned
                for (int i = 0; i < calcWidth; i++) {
                    DataCopy(dst[dstOffset], src[srcOffset],
                             { static_cast<uint16_t>(height), 1, static_cast<uint16_t>(srcGap), 0 }, enhancedParams);
                    dstOffset += calcHeightExr * BLOCK_CUBE * c0Size_;
                    srcOffset += c0Size_;
                }
            }

            LocalTensor<TRANS_T> trans;
            // tail gm->ub pad zero, and then ub->l1
            trans = MATMUL_MODULE(LocalWorkspace)->GetND2NZWorkspace(0)
                        .template ReinterpretCast<TRANS_T>();
            int tranSize = width * EACH_BLOCK_BYTES_MM_API / sizeof(SRC_T);
            trans.SetSize(tranSize);

            int tailSrcoffset = row * gCol + col + calcWidth * c0Size_;
            // ub->ub: gather each tail row into one c0 block of the workspace
            for (int i = 0; i < height; i++) {
                DataCopy(trans[i * c0Size_], src[tailSrcoffset], { 1, 1, 0, 0 }, enhancedParams);
                tailSrcoffset += gCol;
            }

            event_t eventIDMte2ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V));
            SetFlag<HardEvent::MTE2_V>(eventIDMte2ToV);
            WaitFlag<HardEvent::MTE2_V>(eventIDMte2ToV);

            // tail pad zero: replicate a 16-bit "elements >= tail" mask across all 16-bit
            // lanes of the 128-bit mask, then Duplicate() zero into the masked positions.
            uint64_t mask[2];
            uint16_t mask_tail_16bit = ~((1 << tail) - 1);
            uint64_t mask_tail_64bit = mask_tail_16bit;
            mask[0] = mask_tail_64bit + (mask_tail_64bit << FIRST_16BIT_OFFSET_MM_API) +
                (mask_tail_64bit << SECOND_16BIT_OFFSET_MM_API) + (mask_tail_64bit << THIRD_16BIT_OFFSET_MM_API);
            mask[1] = mask[0];
            constexpr int32_t DUP_CEIL_NUM = 8;
            if constexpr (IsSameTypeV<TRANS_T, int8_t>) {
                // int8 data is zeroed through an int16 view — presumably because mask-mode
                // Duplicate does not take int8 on this path; confirm against the API docs.
                LocalTensor<int16_t> tmpTrans = trans.template ReinterpretCast<int16_t>();
                Duplicate(tmpTrans, (int16_t)0, mask, static_cast<uint8_t>(CeilT<int32_t>(height, DUP_CEIL_NUM)), 1, DUP_CEIL_NUM);
            } else {
                Duplicate(trans, (TRANS_T)0, mask, static_cast<uint8_t>(CeilT<int32_t>(height, DUP_CEIL_NUM)), 1, DUP_CEIL_NUM);
            }

            event_t eventIDVToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
            SetFlag<HardEvent::V_MTE3>(eventIDVToMte3);
            WaitFlag<HardEvent::V_MTE3>(eventIDVToMte3);

            // ub->l1: append the zero-padded tail column block after the full blocks
            int heightAlignBlock = CeilT<int32_t>(height, BLOCK_CUBE);
            int tailDstOffset = heightAlignBlock * BLOCK_CUBE * c0Size_ * calcWidth;
            DataCopy(dst[tailDstOffset], trans, { static_cast<uint16_t>(height), 1, 0, 0 }, enhancedParams);
        } else {
            // width is an exact multiple of c0Size_: no tail handling needed.
            int srcGap = gCol * sizeof(TRANS_T) / ONE_BLK_SIZE - 1;
            if (gCol % c0Size_ || srcGap >= UINT16_MAX) {
                int oriSrcOffset = srcOffset;
                int oriDstOffset = dstOffset;
                // each block len is only 32B
                for (int i = 0; i < calcWidth; i++) {
                    for (int j = 0; j < height; j++) {
                        DataCopy(dst[dstOffset], src[srcOffset], { 1, 1, 0, 0 }, enhancedParams);
                        dstOffset += c0Size_;
                        srcOffset += gCol;
                    }
                    srcOffset = oriSrcOffset + (i + 1) * c0Size_;
                    dstOffset = oriDstOffset + (i + 1) * calcHeightExr * BLOCK_CUBE * c0Size_;
                }
            } else {
                // data copy stride is aligned
                for (int i = 0; i < calcWidth; i++) {
                    DataCopy(dst[dstOffset], src[srcOffset],
                             { static_cast<uint16_t>(height), 1, static_cast<uint16_t>(srcGap), 0 }, enhancedParams);
                    dstOffset += calcHeightExr * BLOCK_CUBE * c0Size_;
                    srcOffset += c0Size_;
                }
            }
        }
    }

    // Copies an ND tile (tileHeight x tileWidth) from a UB source into 'dst' in NZ layout.
    // The tile is staged row-by-row (width padded up to a c0Size_ multiple) into the ND2NZ
    // workspace, transposed into fractal form via TransDataNDBMatrix, then moved with CopyNZ2NZ.
    // IS_TRANS selects the transposed view of the input when querying base/org shapes.
    template <bool IS_TRANS = false>
    __aicore__ inline void CopyND2NZWithTransData(const LocalTensor<TRANS_T>& dst, LocalTensor<SRC_T>& src,
                                                  const int row, const int col, const int tileHeight,
                                                  const int tileWidth)
    {
        // Pad tile dims up to c0Size_ multiples; staging/transdata buffers work on full blocks.
        int calcWidth = CeilT(tileWidth, c0Size_) * c0Size_;
        int calcHigh = CeilT(tileHeight, c0Size_) * c0Size_;
        int64_t size = calcHigh * calcWidth;
        LocalTensor<TRANS_T> rightMatrix = MATMUL_MODULE(LocalWorkspace)->GetND2NZWorkspace(0)
                                               .template ReinterpretCast<TRANS_T>();
        rightMatrix.SetSize(size);
        // Element offset of the tile origin inside the (possibly transposed) source matrix.
        int srcOffset = row * MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight<IS_TRANS>() *
                            MATMUL_MODULE(CopyCubeInParams)->template GetOrgWidth<IS_TRANS>() +
                        col * MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth<IS_TRANS>();
        int dstOffset = 0;
        // Stage the tile row by row; each copy carries calcWidth (padded) elements.
        for (int i = 0; i < tileHeight; i++) {
            DataCopy(rightMatrix[dstOffset], src[srcOffset], calcWidth);
            srcOffset += MATMUL_MODULE(CopyCubeInParams)->template GetOrgWidth<IS_TRANS>();
            dstOffset += calcWidth;
        }
        LocalTensor<TRANS_T> trans = MATMUL_MODULE(LocalWorkspace)->GetND2NZWorkspace(size)
                                         .template ReinterpretCast<TRANS_T>();
        trans.SetSize(size);
        // NOTE(review): this UB-source overload syncs MTE3->V before the transpose (the
        // GlobalTensor overload uses MTE2->V) — presumably the UB->UB staging copy issues
        // on the MTE3 pipe; confirm against the DataCopy pipe assignment.
        event_t eventIDMte3ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V));
        SetFlag<HardEvent::MTE3_V>(eventIDMte3ToV);
        WaitFlag<HardEvent::MTE3_V>(eventIDMte3ToV);
        TransDataNDBMatrix(trans, rightMatrix, tileHeight, tileWidth);
        // Transpose runs on the vector pipe; wait before MTE3 consumes 'trans'.
        event_t eventIDVToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
        SetFlag<HardEvent::V_MTE3>(eventIDVToMte3);
        WaitFlag<HardEvent::V_MTE3>(eventIDVToMte3);
        CopyNZ2NZ(dst, trans, 0, 0, calcWidth, calcHigh, calcWidth);
    }

    // Copies an ND tile (tileHeight x tileWidth) from global memory into 'dst' in NZ layout.
    // Mirrors the UB-source overload: stage rows into the ND2NZ workspace, transpose via
    // TransDataNDBMatrix, then CopyNZ2NZ into dst. The pre-transpose sync is MTE2->V here
    // because the staging copy loads from GM.
    template <bool IS_TRANS = false>
    __aicore__ inline void CopyND2NZWithTransData(const LocalTensor<TRANS_T>& dst, GlobalTensor<SRC_T>& src,
                                                  const int row, const int col, const int tileHeight,
                                                  const int tileWidth)
    {
        // Pad tile dims up to c0Size_ multiples for the staging/transdata buffers.
        int calcWidth = CeilT(tileWidth, c0Size_) * c0Size_;
        int calcHigh = CeilT(tileHeight, c0Size_) * c0Size_;
        int64_t size = calcHigh * calcWidth;
        LocalTensor<TRANS_T> rightMatrix = MATMUL_MODULE(LocalWorkspace)->GetND2NZWorkspace(0)
                                               .template ReinterpretCast<TRANS_T>();
        rightMatrix.SetSize(size);
        // Element offset of the tile origin inside the (possibly transposed) source matrix.
        int srcOffset = row * MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight<IS_TRANS>() *
                            MATMUL_MODULE(CopyCubeInParams)->template GetOrgWidth<IS_TRANS>() +
                        col * MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth<IS_TRANS>();
        int dstOffset = 0;
        // GM -> UB staging, one padded row per copy.
        for (int i = 0; i < tileHeight; i++) {
            DataCopy(rightMatrix[dstOffset], src[srcOffset], calcWidth);
            srcOffset += MATMUL_MODULE(CopyCubeInParams)->template GetOrgWidth<IS_TRANS>();
            dstOffset += calcWidth;
        }
        LocalTensor<TRANS_T> trans = MATMUL_MODULE(LocalWorkspace)->GetND2NZWorkspace(size)
                                         .template ReinterpretCast<TRANS_T>();
        trans.SetSize(size);
        // Wait for the GM load (MTE2) before the vector-pipe transpose reads the staging buffer.
        event_t eventIDMte2ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V));
        SetFlag<HardEvent::MTE2_V>(eventIDMte2ToV);
        WaitFlag<HardEvent::MTE2_V>(eventIDMte2ToV);
        TransDataNDBMatrix(trans, rightMatrix, tileHeight, tileWidth);
        // Transpose runs on the vector pipe; wait before MTE3 consumes 'trans'.
        event_t eventIDVToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
        SetFlag<HardEvent::V_MTE3>(eventIDVToMte3);
        WaitFlag<HardEvent::V_MTE3>(eventIDVToMte3);
        CopyNZ2NZ(dst, trans, 0, 0, calcWidth, calcHigh, calcWidth);
    }

    // Copies an NZ tile from a UB source into 'dst', re-blocked through TransDataNZBMatrix.
    // Unlike the GlobalTensor overload there is no staging copy: the source already resides
    // in UB, so the transpose reads src[srcOffset] directly.
    template <bool IS_TRANS = false>
    __aicore__ inline void CopyNZ2NZWithTransData(const LocalTensor<TRANS_T>& dst, LocalTensor<SRC_T>& src,
                                                  const int row, const int col, const int tileHeight,
                                                  const int tileWidth)
    {
        int64_t size = tileHeight * tileWidth;
        LocalTensor<TRANS_T> trans = MATMUL_MODULE(LocalWorkspace)->GetND2NZWorkspace(size)
                                        .template ReinterpretCast<TRANS_T>();
        trans.SetSize(size);
        // NZ addressing: rows advance by c0Size_ within a fractal; columns advance by
        // whole (orgHeight x c0Size_) column blocks.
        int srcOffset = row * MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight<IS_TRANS>() * c0Size_ +
                        col * MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth<IS_TRANS>() *
                            MATMUL_MODULE(CopyCubeInParams)->template GetOrgHeight<IS_TRANS>();
        TransDataNZBMatrix<IS_TRANS>(trans, src[srcOffset], tileHeight, tileWidth);
        // Transpose runs on the vector pipe; wait before MTE3 consumes 'trans'.
        event_t eventIDVToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
        SetFlag<HardEvent::V_MTE3>(eventIDVToMte3);
        WaitFlag<HardEvent::V_MTE3>(eventIDVToMte3);
        CopyNZ2NZ(dst, trans, 0, 0, tileWidth, tileHeight, tileWidth);
    }

    // Copies an NZ tile from global memory into 'dst': stage one contiguous copy per
    // c0-wide column block into the ND2NZ workspace, re-block via TransDataNZBMatrix,
    // then CopyNZ2NZ into dst.
    template <bool IS_TRANS = false>
    __aicore__ inline void CopyNZ2NZWithTransData(const LocalTensor<TRANS_T>& dst, GlobalTensor<SRC_T>& src,
                                                  const int row, const int col, const int tileHeight,
                                                  const int tileWidth)
    {
        // Pad tile dims up to c0Size_ multiples for the staging/transdata buffers.
        int calcWidth = CeilT(tileWidth, c0Size_) * c0Size_;
        int calcHigh = CeilT(tileHeight, c0Size_) * c0Size_;
        int64_t size = calcHigh * calcWidth;
        LocalTensor<TRANS_T> rightMatrix = MATMUL_MODULE(LocalWorkspace)->GetND2NZWorkspace(0)
                                               .template ReinterpretCast<TRANS_T>();
        rightMatrix.SetSize(size);
        // NZ addressing: rows advance by c0Size_ within a fractal; columns advance by
        // whole (orgHeight x c0Size_) column blocks.
        int srcOffset = row * MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight<IS_TRANS>() * c0Size_ +
                        col * MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth<IS_TRANS>() *
                            MATMUL_MODULE(CopyCubeInParams)->template GetOrgHeight<IS_TRANS>();
        int dstOffset = 0;
        // Source column-block pitch: org height rounded up to a full fractal. Was a magic
        // '16'; use the named BLOCK_CUBE constant consistently with the rest of this file.
        int srcHigh = CeilT(MATMUL_MODULE(CopyCubeInParams)->template GetOrgHeight<IS_TRANS>(), BLOCK_CUBE) *
                      BLOCK_CUBE * c0Size_;
        int dstHigh = tileHeight < c0Size_ ? tileHeight * c0Size_ : calcHigh * c0Size_;
        // GM -> UB: one contiguous copy per c0-wide column block.
        for (int i = 0; i < CeilT(tileWidth, c0Size_); i++) {
            DataCopy(rightMatrix[dstOffset], src[srcOffset], dstHigh);
            srcOffset += srcHigh;
            dstOffset += dstHigh;
        }
        LocalTensor<TRANS_T> trans = MATMUL_MODULE(LocalWorkspace)->GetND2NZWorkspace(size)
                                         .template ReinterpretCast<TRANS_T>();
        trans.SetSize(size);
        // Wait for the GM load (MTE2) before the vector-pipe transpose reads the staging buffer.
        event_t eventIDMte2ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V));
        SetFlag<HardEvent::MTE2_V>(eventIDMte2ToV);
        WaitFlag<HardEvent::MTE2_V>(eventIDMte2ToV);
        TransDataNZBMatrix<IS_TRANS>(trans, rightMatrix, tileHeight, tileWidth);
        // Transpose runs on the vector pipe; wait before MTE3 consumes 'trans'.
        event_t eventIDVToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
        SetFlag<HardEvent::V_MTE3>(eventIDVToMte3);
        WaitFlag<HardEvent::V_MTE3>(eventIDVToMte3);
        CopyNZ2NZ(dst, trans, 0, 0, calcWidth, calcHigh, calcWidth);
    }

    // Transposes an ND staging buffer (height rows of calcWidth padded elements) into NZ
    // "B matrix" fractal layout using TransDataTo5HD with 16-entry address lists. Each full
    // c0-wide column block takes four passes covering all (srcHighHalf, dstHighHalf)
    // combinations; a PipeBarrier<PIPE_V> separates passes because they reuse the lists.
    __aicore__ inline void TransDataNDBMatrix(const LocalTensor<TRANS_T>& dst, const LocalTensor<SRC_T>& src,
                                              int height, int width)
    {
        int iterK = CeilT(height, c0Size_);  // repeat count along the K (height) direction
        int iterN = CeilT(width, c0Size_);   // number of c0-wide column blocks along N (width)
        int calcWidth = iterN * c0Size_;     // padded row pitch of the src staging buffer
        // NOTE(review): mixes c0Size_ with the fixed 16-entry list size; for int8
        // (c0Size_ == 32) a remainder above 16 disables the tail path — confirm intended.
        int tailWidth = (width % c0Size_) > TRANS_DATA_ARRAY_SIZE_MM_API ? 0 : width % TRANS_DATA_ARRAY_SIZE_MM_API;
        TransDataTo5HDParams params;
        params.repeatTimes = iterK;
        params.dstRepStride = iterK == 1 ? 0 : calcWidth;
        params.srcRepStride = iterK == 1 ? 0 : calcWidth;
        int dstHighHalfOffset = TRANS_DATA_ARRAY_SIZE_MM_API * c0Size_;    // dst offset 16 fractal rows ahead
        int srcHighHalfOffset = TRANS_DATA_ARRAY_SIZE_MM_API * calcWidth;  // src offset 16 rows ahead
        iterN = tailWidth ? iterN - 1 : iterN; // last block handled by the tail path below
        uint64_t dstLocalList[TRANS_DATA_ARRAY_SIZE_MM_API];
        uint64_t srcLocalList[TRANS_DATA_ARRAY_SIZE_MM_API];
        int dstOffset = 0;
        int srcOffset = 0;
        for (int curN = 0; curN < iterN; curN++) {
            // Pass 1: base src/dst lists, low halves on both sides.
            int dstListOffset = 0;
            int srcListOffset = 0;
            for (int i = 0; i < TRANS_DATA_ARRAY_SIZE_MM_API; i++) {
                dstLocalList[i] = (uint64_t)(dst[dstOffset + dstListOffset].GetPhyAddr());
                srcLocalList[i] = (uint64_t)(src[srcOffset + srcListOffset].GetPhyAddr());
                dstListOffset += c0Size_;
                srcListOffset += calcWidth;
            }
            params.dstHighHalf = false;
            params.srcHighHalf = false;
            TransDataTo5HD<TRANS_T>(dstLocalList, srcLocalList, params);
            PipeBarrier<PIPE_V>();
            // Pass 2: rebuild the src list 16 rows further down; write the dst high half.
            srcListOffset = 0;
            for (int i = 0; i < TRANS_DATA_ARRAY_SIZE_MM_API; i++) {
                srcLocalList[i] = (uint64_t)(src[srcOffset + srcListOffset + srcHighHalfOffset].GetPhyAddr());
                srcListOffset += calcWidth;
            }
            params.dstHighHalf = true;
            params.srcHighHalf = false;
            TransDataTo5HD<TRANS_T>(dstLocalList, srcLocalList, params);
            PipeBarrier<PIPE_V>();
            // Pass 3: dst list moved to the high-half offset; read the src high half.
            dstListOffset = 0;
            srcListOffset = 0;
            for (int i = 0; i < TRANS_DATA_ARRAY_SIZE_MM_API; i++) {
                dstLocalList[i] = (uint64_t)(dst[dstOffset + dstListOffset + dstHighHalfOffset].GetPhyAddr());
                srcLocalList[i] = (uint64_t)(src[srcOffset + srcListOffset].GetPhyAddr());
                dstListOffset += c0Size_;
                srcListOffset += calcWidth;
            }
            params.dstHighHalf = false;
            params.srcHighHalf = true;
            TransDataTo5HD<TRANS_T>(dstLocalList, srcLocalList, params);
            PipeBarrier<PIPE_V>();
            // Pass 4: src list 16 rows down again; high halves on both sides.
            srcListOffset = 0;
            for (int i = 0; i < TRANS_DATA_ARRAY_SIZE_MM_API; i++) {
                srcLocalList[i] = (uint64_t)(src[srcOffset + srcListOffset + srcHighHalfOffset].GetPhyAddr());
                srcListOffset += calcWidth;
            }
            params.dstHighHalf = true;
            params.srcHighHalf = true;
            TransDataTo5HD<TRANS_T>(dstLocalList, srcLocalList, params);
            PipeBarrier<PIPE_V>();
            // Advance one c0 x c0 fractal in dst, one c0 block of columns in src.
            dstOffset += c0Size_ * c0Size_;
            srcOffset += c0Size_;
        }
        if (tailWidth) {
            // Tail block narrower than a full c0 block: only two passes are needed,
            // with a tighter dst repeat stride.
            dstOffset = iterN * c0Size_ * c0Size_;
            srcOffset = iterN * c0Size_;
            int dstListOffset = 0;
            int srcListOffset = 0;
            params.dstRepStride = iterK == 1 ? 0 : TRANS_DATA_ARRAY_SIZE_MM_API;
            for (int i = 0; i < TRANS_DATA_ARRAY_SIZE_MM_API; i++) {
                dstLocalList[i] = (uint64_t)(dst[dstOffset + dstListOffset].GetPhyAddr());
                srcLocalList[i] = (uint64_t)(src[srcOffset + srcListOffset].GetPhyAddr());
                dstListOffset += c0Size_;
                srcListOffset += calcWidth;
            }
            params.dstHighHalf = false;
            params.srcHighHalf = false;
            TransDataTo5HD<TRANS_T>(dstLocalList, srcLocalList, params);
            PipeBarrier<PIPE_V>();
            srcListOffset = 0;
            for (int i = 0; i < TRANS_DATA_ARRAY_SIZE_MM_API; i++) {
                srcLocalList[i] = (uint64_t)(src[srcOffset + srcListOffset + srcHighHalfOffset].GetPhyAddr());
                srcListOffset += calcWidth;
            }
            params.dstHighHalf = true;
            params.srcHighHalf = false;
            TransDataTo5HD<TRANS_T>(dstLocalList, srcLocalList, params);
            PipeBarrier<PIPE_V>();
        }
    }

    // Transposes an NZ-layout source block into NZ "B matrix" fractal layout via
    // TransDataTo5HD. The src advances by one whole NZ column block (height * c0Size_)
    // per iteration; dst advances one c0 x c0 fractal. Four passes per block cover all
    // (srcHighHalf, dstHighHalf) combinations.
    // NOTE(review): template parameter IS_TRANS is not referenced in this body — it only
    // keeps the call signature symmetric with the callers; confirm that is intentional.
    template <bool IS_TRANS = false>
    __aicore__ inline void TransDataNZBMatrix(const LocalTensor<TRANS_T>& dst, const LocalTensor<SRC_T>& src,
                                              int height, int width)
    {
        int iterK = CeilT(height, c0Size_);  // repeat count along K (height)
        int iterN = CeilT(width, c0Size_);   // number of c0-wide column blocks along N (width)
        int calcWidth = iterN * c0Size_;
        int tailWidth = width % c0Size_;     // partial last block, if any
        TransDataTo5HDParams params;
        params.repeatTimes = iterK;
        params.dstRepStride = iterK == 1 ? 0 : calcWidth;
        params.srcRepStride = iterK == 1 ? 0 : c0Size_; // src is NZ: next repeat is one fractal down
        int dstHighHalfOffset = TRANS_DATA_ARRAY_SIZE_MM_API * c0Size_;
        int srcHighHalfOffset = TRANS_DATA_ARRAY_SIZE_MM_API * c0Size_;
        uint64_t dstLocalList[TRANS_DATA_ARRAY_SIZE_MM_API];
        uint64_t srcLocalList[TRANS_DATA_ARRAY_SIZE_MM_API];
        int dstOffset = 0;
        int srcOffset = 0;
        for (int curN = 0; curN < iterN; curN++) {
            // For a partial final block, narrow the dst repeat stride to the tail width.
            params.dstRepStride =
                (curN == iterN - 1 && tailWidth > 0 && tailWidth < c0Size_) ? tailWidth : params.dstRepStride;
            // Pass 1: both lists at the high-half offsets, high halves on both sides.
            int dstListOffset = 0;
            int srcListOffset = 0;
            for (int i = 0; i < TRANS_DATA_ARRAY_SIZE_MM_API; i++) {
                dstLocalList[i] = (uint64_t)(dst[dstOffset + dstListOffset + dstHighHalfOffset].GetPhyAddr());
                srcLocalList[i] = (uint64_t)(src[srcOffset + srcListOffset + srcHighHalfOffset].GetPhyAddr());
                dstListOffset += c0Size_;
                srcListOffset += c0Size_;
            }
            params.dstHighHalf = true;
            params.srcHighHalf = true;
            TransDataTo5HD<TRANS_T>(dstLocalList, srcLocalList, params);
            PipeBarrier<PIPE_V>();
            // Pass 2: src list back at the base offset; dst low half.
            srcListOffset = 0;
            for (int i = 0; i < TRANS_DATA_ARRAY_SIZE_MM_API; i++) {
                srcLocalList[i] = (uint64_t)(src[srcOffset + srcListOffset].GetPhyAddr());
                srcListOffset += c0Size_;
            }
            params.dstHighHalf = false;
            params.srcHighHalf = true;
            TransDataTo5HD<TRANS_T>(dstLocalList, srcLocalList, params);
            PipeBarrier<PIPE_V>();
            // Pass 3: dst list back at the base offset; low halves on both sides.
            dstListOffset = 0;
            for (int i = 0; i < TRANS_DATA_ARRAY_SIZE_MM_API; i++) {
                dstLocalList[i] = (uint64_t)(dst[dstOffset + dstListOffset].GetPhyAddr());
                dstListOffset += c0Size_;
            }
            params.dstHighHalf = false;
            params.srcHighHalf = false;
            TransDataTo5HD<TRANS_T>(dstLocalList, srcLocalList, params);
            PipeBarrier<PIPE_V>();
            // Pass 4: src list at the high-half offset; dst high half.
            srcListOffset = 0;
            for (int i = 0; i < TRANS_DATA_ARRAY_SIZE_MM_API; i++) {
                srcLocalList[i] = (uint64_t)(src[srcOffset + srcListOffset + srcHighHalfOffset].GetPhyAddr());
                srcListOffset += c0Size_;
            }
            params.dstHighHalf = true;
            params.srcHighHalf = false;
            TransDataTo5HD<TRANS_T>(dstLocalList, srcLocalList, params);
            PipeBarrier<PIPE_V>();
            // Next column block: one fractal ahead in dst, one NZ column block ahead in src.
            dstOffset += c0Size_ * c0Size_;
            srcOffset += height * c0Size_;
        }
    }

    // Dequantizes a quantized input block into quantOut using per-channel scale/offset
    // tensors (or delegates to the per-tensor scalar path when MM_CFG.isPerTensor).
    // isBankConflict pads the N dimension by one align unit to sidestep UB bank conflicts.
    // Compiled only for __CCE_AICORE__ == 200; a no-op elsewhere.
    __aicore__ inline void AntiQuantCompute(const LocalTensor<TRANS_T>& quantOut, const LocalTensor<SRC_T>& quantIn,
                                            bool isBankConflict)
    {
#if __CCE_AICORE__ == 200
        LocalTensor<uint8_t> sharedLocal = GetSharedLocal();
        if constexpr (ToMatmulConfig(MM_CFG).isPerTensor) {
            // One scalar scale/offset for the whole tensor.
            AntiQuantComputePerTensor(quantOut, quantIn, sharedLocal);
        } else {
            uint32_t groupNum = 1;
            AntiQuantShapeInfo shapeInfo;
            if (IsTranspose()) {
                // Transposed weight: scale/offset shaped (N x group).
                uint32_t quantN = CeilAlignT<int32_t>(GetBaseUseN(), ANTI_QUANT_ALIGN_SIZE_MM_API);
                if constexpr (ToMatmulConfig(MM_CFG).hasAntiQuantOffset) {
                    shapeInfo.offsetHeight = quantN;
                    shapeInfo.offsetWidth = groupNum;
                }
                shapeInfo.scaleHeight = quantN;
                shapeInfo.scaleWidth = groupNum;
            } else {
                // Non-transposed weight: scale/offset shaped (group x N), optionally padded.
                // (Was 'int quantN' — unified to uint32_t to match the transpose branch.)
                uint32_t quantN = CeilAlignT<int32_t>(GetBaseUseN(), ANTI_QUANT_ALIGN_SIZE_MM_API);
                uint32_t padNSize = isBankConflict ? quantN + ANTI_QUANT_ALIGN_SIZE_MM_API : quantN;
                if constexpr (ToMatmulConfig(MM_CFG).hasAntiQuantOffset) {
                    shapeInfo.offsetHeight = groupNum;
                    shapeInfo.offsetWidth = padNSize;
                }
                shapeInfo.scaleHeight = groupNum;
                shapeInfo.scaleWidth = padNSize;
            }
            if constexpr (ToMatmulConfig(MM_CFG).hasAntiQuantOffset) {
                if (IsTranspose()) {
                    AscendAntiQuant<SRC_T, TRANS_T, true>(
                        quantOut, quantIn, GetAntiQuantOffsetTensor(),
                        GetAntiQuantScaleTensor(), sharedLocal,
                        CeilAlignT<int32_t>(GetBaseUseStepKb(), ANTI_QUANT_ALIGN_SIZE_MM_API), shapeInfo);
                } else {
                    AscendAntiQuant<SRC_T, TRANS_T, false>(
                        quantOut, quantIn, GetAntiQuantOffsetTensor(),
                        GetAntiQuantScaleTensor(), sharedLocal,
                        CeilAlignT<int32_t>(GetBaseUseStepKb(), ANTI_QUANT_ALIGN_SIZE_MM_API), shapeInfo);
                }
            } else {
                if (IsTranspose()) {
                    AscendAntiQuant<SRC_T, TRANS_T, true>(
                        quantOut, quantIn, GetAntiQuantScaleTensor(), sharedLocal,
                        CeilAlignT<int32_t>(GetBaseUseStepKb(), ANTI_QUANT_ALIGN_SIZE_MM_API), shapeInfo);
                } else {
                    AscendAntiQuant<SRC_T, TRANS_T, false>(
                        quantOut, quantIn, GetAntiQuantScaleTensor(), sharedLocal,
                        CeilAlignT<int32_t>(GetBaseUseStepKb(), ANTI_QUANT_ALIGN_SIZE_MM_API), shapeInfo);
                }
            }
        }
#endif
    }

    // Dequantizes a per-tensor quantized block: applies a single scalar scale (and an
    // optional scalar offset, per MM_CFG) to quantIn, writing the result to quantOut.
    // Compiled only for __CCE_AICORE__ == 200; a no-op elsewhere.
    __aicore__ inline void AntiQuantComputePerTensor(const LocalTensor<TRANS_T>& quantOut,
                                                     const LocalTensor<SRC_T>& quantIn,
                                                     const LocalTensor<uint8_t>& sharedLocal)
    {
#if __CCE_AICORE__ == 200
        // Hoist the shared arguments: K aligned up to the anti-quant unit, and the layout flag.
        const int32_t alignedK = CeilAlignT<int32_t>(GetBaseUseStepKb(), ANTI_QUANT_ALIGN_SIZE_MM_API);
        const bool transposed = IsTranspose();
        if constexpr (ToMatmulConfig(MM_CFG).hasAntiQuantOffset) {
            if (transposed) {
                AscendAntiQuant<SRC_T, TRANS_T, true>(quantOut, quantIn, GetAntiQuantOffsetScalar(),
                                                      GetAntiQuantScaleScalar(), sharedLocal, alignedK);
            } else {
                AscendAntiQuant<SRC_T, TRANS_T, false>(quantOut, quantIn, GetAntiQuantOffsetScalar(),
                                                       GetAntiQuantScaleScalar(), sharedLocal, alignedK);
            }
        } else {
            if (transposed) {
                AscendAntiQuant<SRC_T, TRANS_T, true>(quantOut, quantIn, GetAntiQuantScaleScalar(),
                                                      sharedLocal, alignedK);
            } else {
                AscendAntiQuant<SRC_T, TRANS_T, false>(quantOut, quantIn, GetAntiQuantScaleScalar(),
                                                       sharedLocal, alignedK);
            }
        }
#endif
    }

    // Carves a uint8 scratch view for anti-quant temporaries out of the tail of the local
    // workspace. Returns a default (unset) tensor when the input is not transposed —
    // callers pass the result through to AscendAntiQuant either way.
    __aicore__ inline LocalTensor<uint8_t> GetSharedLocal()
    {
        LocalTensor<uint8_t> workspaceView;
        if (!IsTranspose()) {
            return workspaceView;
        }
        // Bytes reserved for scale data plus two 32B alignment blocks.
        const int32_t scaleUbSize = MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth() * 2 + 32 * 2;
        // Bytes reserved for the transpose temporaries (16 rows of baseWidth, two banks).
        const int32_t tmpBuffSize = 16 * MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth() * 2 * sizeof(TRANS_T);
        ASCENDC_ASSERT((MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength() > (tmpBuffSize + scaleUbSize)), {
            KERNEL_LOG(KERNEL_ERROR, "transLength(%d) must be larger than tmpBuffSize(%d) + scaleUbSize(%d)",
                       MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength(), tmpBuffSize, scaleUbSize);
        });
        // Place the scratch region at the very end of the trans workspace.
        const int32_t tmpBuffOffset =
            MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength() - tmpBuffSize - scaleUbSize;
        workspaceView = MATMUL_MODULE(LocalWorkspace)
                            ->GetWorkspaceWithOffset(tmpBuffOffset)
                            .template ReinterpretCast<uint8_t>();
        workspaceView.SetSize(tmpBuffSize);
        return workspaceView;
    }

private:

    // Returns the per-channel anti-quant offset tensor stored in the matmul const-param block.
    __aicore__ inline LocalTensor<TRANS_T> GetAntiQuantOffsetTensor() const
    {
        return MATMUL_CONST_PARAM_VAR.antiQuantOffsetTensor_;
    }

    // Returns the per-channel anti-quant scale tensor stored in the matmul const-param block.
    __aicore__ inline LocalTensor<TRANS_T> GetAntiQuantScaleTensor() const
    {
        return MATMUL_CONST_PARAM_VAR.antiQuantScaleTensor_;
    }

    // Returns the per-tensor anti-quant offset scalar stored in the matmul const-param block.
    __aicore__ inline TRANS_T GetAntiQuantOffsetScalar() const
    {
        return MATMUL_CONST_PARAM_VAR.antiQuantOffsetScalar_;
    }

    // Returns the per-tensor anti-quant scale scalar stored in the matmul const-param block.
    __aicore__ inline TRANS_T GetAntiQuantScaleScalar() const
    {
        return MATMUL_CONST_PARAM_VAR.antiQuantScaleScalar_;
    }

    // Elements per c0 (innermost NZ) block for TRANS_T, as given by AuxGetC0Size.
    constexpr static int32_t c0Size_ = AuxGetC0Size<TRANS_T>();
    // Queue backing the UB-side input cache.
    typename CubeInQueType<INPUT_TYPE>::QUE qidUBCache_;
    LocalTensor<SRC_T> cacheHead2UB_; // Allocate and release using qidUBCache_
    // Progress counter for the UB cache — usage is outside this chunk; see callers.
    int32_t cache2UBProc_ = 0;
};
}  // namespace Detail
}  // namespace Impl
}  // namespace AscendC
#endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_USING_UB_H
